#!/usr/bin/perl # mcClassifier.pl - Sam Shepard - 11.2009 # Homogeneous markov chain classifier. # Contact sammysheep@gmail.com # LICENSE: GPL version 3 or later # GET the input parameters. if ( scalar(@ARGV) < 3 ) { $message = "\nUsage:\n\tperl $0 "; $message .= " [frame]\n"; die( $message ); } # PROCESS parameters if ( scalar(@ARGV) > 3 ) { $frameshift = $ARGV[3]; } else { $frameshift = 0; } # Trim function. # Removes whitespace from the start and end of the string sub trim($) { my $string = shift; $string =~ /^\s*(.*?)\s*$/; return $1; } # read samples open( ITRAIN, '<', "$ARGV[0]/training_intron.fa") or die("Cannot open $ARGV[0]/training_intron.fa\n"); open( ETRAIN, '<', "$ARGV[0]/training_exon.fa") or die("Cannot open $ARGV[0]/training_exon.fa\n"); open( ITEST, '<', "$ARGV[0]/test_intron.fa") or die("Cannot open $ARGV[0]/test_intron.fa\n"); open( ETEST, '<', "$ARGV[0]/test_exon.fa") or die("Cannot open $ARGV[0]/test_exon.fa\n"); $mmOrder = $ARGV[2]; # Markov model order. # INITIALIZE variables. @handles = ('ITRAIN','ETRAIN','ITEST','ETEST'); $/ = '>'; $i = 0; $itrain_start = $itrain_end = 0; $etrain_start = $etrain_end = 0; $itest_start = $itest_end = 0; $etest_start = $etest_end = 0; $minimum_length = 5 * ($mmOrder+1); $remaining_length = 0; # READ sequence files. foreach $handle ( @handles ) { if ( $handle eq 'ITRAIN' ) { $itrain_start = $i; } elsif ( $handle eq 'ETRAIN' ) { $etrain_start = $i; } elsif ( $handle eq 'ITEST' ) { $itest_start = $i; } else { $etest_start = $i; } while( $record = <$handle> ) { @lines = split("\n", $record); $header = shift(@lines); chomp(@lines); $sequence = lc(join('',@lines)); $sequence = trim($sequence); $current_length = length($sequence); # if too short, skip it. $remaining_length = $current_length - $frameshift; if ( $remaining_length < $minimum_length ) { next; } else { $nt_count = ($sequence =~ tr/atgc//); if ( $current_length != $nt_count ) { next; } else { $sequences[$i] = substr($sequence, $frameshift, $remaining_length); $i++; } } } if ( $handle eq 'ITRAIN' ) { $itrain_end = $i; } elsif ( $handle eq 'ETRAIN' ) { $etrain_end = $i; } elsif ( $handle eq 'ITEST' ) { $itest_end = $i; } else { $etest_end = $i; } close($handle); } # INITIALIZE markov model variables. %initE = %initI = %transE = %transI = (); $initLen = $mmOrder; $transLen = $mmOrder + 1; @scoresI = @scoresE = (); # Open output file. open( OUT, '>', "$ARGV[1]") or die("Cannot open $ARGV[1].\n"); # TRAIN the classifier. # Process introns. for( $i = $itrain_start; $i < $itrain_end; $i++ ) { $current_length = length($sequences[$i]); for($pos = 0; $pos < ($current_length-$initLen); $pos++) { $key = substr($sequences[$i], $pos, $initLen); $initI{$key}++; $sumInitI++; } for($pos = 0; $pos <= ($current_length-$transLen); $pos++) { $key = substr($sequences[$i], $pos, $transLen); $transI{$key}++; $sumTransI++; } } # Process exons. for( $i = $etrain_start; $i < $etrain_end; $i++ ) { $current_length = length($sequences[$i]); for($pos = 0; $pos <= ($current_length-$initLen); $pos++) { $key = substr($sequences[$i], $pos, $initLen); $initE{$key}++; $sumInitE++; } for($pos = 0; $pos <= ($current_length-$transLen); $pos++) { $key = substr($sequences[$i], $pos, $transLen); $transE{$key}++; $sumTransE++; } } # Convert occurrences to frequencies. foreach $key ( keys(%initE) ) { $initE{$key} /= $sumInitE; $initI{$key} /= $sumInitI; } # TEST data. # Process introns. for( $i = $itest_start; $i < $itest_end; $i++ ) { $current_length = length($sequences[$i]); $which = $i - $itest_start; $scoresI[$which] = 0; $key = substr( $sequences[$i], 0, $initLen); $probI = log($initI{$key}); $probE = log($initE{$key}); for($pos = 0; $pos <= ($current_length-$transLen); $pos++ ) { $key = substr( $sequences[$i], $pos, $transLen); $keyA = substr( $key, 0, $initLen ) . 'a'; $keyG = substr( $key, 0, $initLen ) . 'g'; $keyC = substr( $key, 0, $initLen ) . 'c'; $keyT = substr( $key, 0, $initLen ) . 't'; # Calculate the denominators and log probabilities. $iDenom = $transI{$keyA}+$transI{$keyG}; $iDenom += $transI{$keyC}+$transI{$keyT}; $probI += log($transI{$key} / $iDenom ); $eDenom = $transE{$keyA}+$transE{$keyG}; $eDenom += $transE{$keyC}+$transE{$keyT}; $probE += log($transE{$key} / $eDenom ); } $scoresI[$which] = $probE - $probI; } # Process exons. for( $i = $etest_start; $i < $etest_end; $i++ ) { $current_length = length($sequences[$i]); $which = $i - $etest_start; $scoresE[$which] = 0; $key = substr( $sequences[$i], 0, $initLen); $probI = log($initI{$key}); $probE = log($initE{$key}); for($pos = 0; $pos <= ($current_length-$transLen); $pos++ ) { $key = substr( $sequences[$i], $pos, $transLen); $keyA = substr( $key, 0, $initLen ) . 'a'; $keyG = substr( $key, 0, $initLen ) . 'g'; $keyC = substr( $key, 0, $initLen ) . 'c'; $keyT = substr( $key, 0, $initLen ) . 't'; # Calculate the denominators and log probabilities. $iDenom = $transI{$keyA}+$transI{$keyG}; $iDenom += $transI{$keyC}+$transI{$keyT}; $probI += log($transI{$key} / $iDenom ); $eDenom = $transE{$keyA}+$transE{$keyG}; $eDenom += $transE{$keyC}+$transE{$keyT}; $probE += log($transE{$key} / $eDenom ); } $scoresE[$which] = $probE - $probI; } # Output the classification scores. print OUT "mm$mmOrder","\n",join(' ', @scoresI),"\n",join(' ', @scoresE),"\n\n"; close(OUT);