#!/usr/bin/perl # convertTrainTestMM.pl - Samuel Shepard - 11.2009 # Convert a nucleotide sequence to binary based on the map. # Train and test the sequences using a binary markov model. # This program is memory intensive. # Contact sammysheep@gmail.com # LICENSE: GPL version 3 or later # Define some global variables, necessary for enumerate(). $K = 0; %map = (); @letters = ('a','t','c','g'); sub enumerate($$) { my $level = shift; my $tail = shift; my $base; if ( $level == 0 ) { $map{$tail} = $K; $K++; } else { for $base ( @letters ) { enumerate(($level-1),"$tail$base"); } } } # GET the input parameters. if ( scalar(@ARGV) < 5 ) { $message = "\nUsage:\n\tperl $0 "; $message .= "\n\t\t "; $message .= "\n\t\t [frame]\n"; die( $message ); } # PROCESS the parameters. $mmOrder = $ARGV[3]; # The markov model order. $mapOrder = $ARGV[4]; # The map order/abstraction level. # Frame shift the data if told to. if ( scalar(@ARGV) > 5 ) { $frameshift = $ARGV[5]; } else { $frameshift = 0; } # Store the abstraction rules. open( MAPS, '<', $ARGV[2] ) or die("Could not open map file $ARGV[2].\n"); $/ = "\n"; @maps = ; chomp(@maps); $number_maps = scalar(@maps); close(MAPS); # Read in the datasets. # Currently assumes fixed naming scheme of exons v. introns. open( ITRAIN, '<', "$ARGV[0]/training_intron.fa") or die("Cannot open $ARGV[0]/training_intron.fa\n"); open( ETRAIN, '<', "$ARGV[0]/training_exon.fa") or die("Cannot open $ARGV[0]/training_exon.fa\n"); open( ITEST, '<', "$ARGV[0]/test_intron.fa") or die("Cannot open $ARGV[0]/test_intron.fa\n"); open( ETEST, '<', "$ARGV[0]/test_exon.fa") or die("Cannot open $ARGV[0]/test_exon.fa\n"); # INITIALIZE variables. @handles = ('ITRAIN','ETRAIN','ITEST','ETEST'); $/ = '>'; $i = 0; $itrain_start = $itrain_end = 0; $etrain_start = $etrain_end = 0; $itest_start = $itest_end = 0; $etest_start = $etest_end = 0; $initBits = $mmOrder; $transBits = $mmOrder + 1; $minimum_length = $mapOrder * ($mmOrder+1); $remaining_length = 0; # READ sequence files. foreach $handle ( @handles ) { if ( $handle eq 'ITRAIN' ) { $itrain_start = $i; } elsif ( $handle eq 'ETRAIN' ) { $etrain_start = $i; } elsif ( $handle eq 'ITEST' ) { $itest_start = $i; } else { $etest_start = $i; } while( $record = <$handle> ) { @lines = split("\n", $record); $header = shift(@lines); chomp(@lines); $sequence = lc(join('',@lines)); $current_length = length($sequence); # if too short, skip it. $remaining_length = $current_length - $frameshift; if ( $remaining_length < $minimum_length ) { next; } else { $nt_count = ($sequence =~ tr/atgc//); if ( $current_length != $nt_count ) { next; } else { $sequences[$i] = substr($sequence, $frameshift, $remaining_length); $i++; } } } if ( $handle eq 'ITRAIN' ) { $itrain_end = $i; } elsif ( $handle eq 'ETRAIN' ) { $etrain_end = $i; } elsif ( $handle eq 'ITEST' ) { $itest_end = $i; } else { $etest_end = $i; } close($handle); } # PROCESS abstraction rules. # Map order (e.g., the abstraction level). if ( $mapOrder == 3 ) { # load map $map{'ttc'} = 0; $map{'tgt'} = 1; $map{'ctt'} = 2; $map{'att'} = 3; $map{'agg'} = 4; $map{'atg'} = 5; $map{'gcg'} = 6; $map{'tta'} = 7; $map{'gct'} = 8; $map{'caa'} = 9; $map{'aat'} = 10; $map{'acg'} = 11; $map{'cgt'} = 12; $map{'tac'} = 13; $map{'cta'} = 14; $map{'cga'} = 15; $map{'aca'} = 16; $map{'ggc'} = 17; $map{'tgg'} = 18; $map{'ccg'} = 19; $map{'gca'} = 20; $map{'ggt'} = 21; $map{'tcg'} = 22; $map{'acc'} = 23; $map{'cat'} = 24; $map{'gag'} = 25; $map{'gtc'} = 26; $map{'act'} = 27; $map{'tcc'} = 28; $map{'cct'} = 29; $map{'gtg'} = 30; $map{'ttg'} = 31; $map{'agc'} = 32; $map{'atc'} = 33; $map{'gaa'} = 34; $map{'cca'} = 35; $map{'gat'} = 36; $map{'ttt'} = 37; $map{'ctg'} = 38; $map{'cgc'} = 39; $map{'aac'} = 40; $map{'tag'} = 41; $map{'tct'} = 42; $map{'gta'} = 43; $map{'tgc'} = 44; $map{'gac'} = 45; $map{'taa'} = 46; $map{'ccc'} = 47; $map{'ggg'} = 48; $map{'cag'} = 49; $map{'aaa'} = 50; $map{'aag'} = 51; $map{'ctc'} = 52; $map{'aga'} = 53; $map{'tca'} = 54; $map{'tga'} = 55; $map{'cgg'} = 56; $map{'ata'} = 57; $map{'gtt'} = 58; $map{'gga'} = 59; $map{'gcc'} = 60; $map{'cac'} = 61; $map{'agt'} = 62; $map{'tat'} = 63; } else { enumerate($mapOrder,''); } # INITIALIZE markov model variables. %initE = %initI = %transE = %transI = (); $patternsInit = 2**$initBits; $patternsTrans = 2**$transBits; # Open output file. open( OUT, '>', "$ARGV[1]") or die("Cannot open $ARGV[1].\n"); # PROCESS each map consecutively $mapI = 0; $map_size = 4**$mapOrder; @bit_sequences = @scoresI = @scoresE = (); for( $mapI = 0; $mapI < $number_maps;$mapI++ ) { # Initialize probabilities. $template = '%0'.$initBits.'b'; for( $i = 0; $i < $patternsInit; $i++ ) { $key = sprintf($template, $i); $initE{$key} = $initI{$key} = 0; } $template = '%0'.$transBits.'b'; for( $i = 0; $i < $patternsTrans; $i++ ) { $key = sprintf($template, $i); $transE{$key} = $transI{$key} = 0; } $sumInitI = $sumInitE = $sumTransI = $sumTransE = 0; # Load the abstraction rule. if ( length($maps[$mapI]) != ($map_size) ) { die( "Map '$maps[$mapI]' not correct length.\n"); } @bits = split('', $maps[$mapI] ); if ( $map_size != scalar(@bits) ) { die("Map size mismatch.\n"); } # Convert each sequence according to the current abstraction rule. for ( $i = 0; $i < $etest_end; $i++ ) { $current_length = length($sequences[$i]); $bit_sequences[$i] = ''; for ( $pos = 0; $pos <= ($current_length - $mapOrder); $pos += $mapOrder) { $key = substr($sequences[$i], $pos, $mapOrder); $bit_sequences[$i] .= $bits[$map{$key}]; } } # TRAIN the BAMM classifier. # Analyze the intron sequences. $totalI = 0; for( $i = $itrain_start; $i < $itrain_end; $i++ ) { $current_length = length($bit_sequences[$i]); for($pos = 0; $pos <= ($current_length-$initBits); $pos++) { $key = substr($bit_sequences[$i], $pos, $initBits); $initI{$key}++; $sumInitI++; } for($pos = 0; $pos <= ($current_length-$transBits); $pos++) { $key = substr($bit_sequences[$i], $pos, $transBits); $transI{$key}++; $sumTransI++; } } # Analyze the exon sequences. $totalE = 0; for( $i = $etrain_start; $i < $etrain_end; $i++ ) { $current_length = length($bit_sequences[$i]); for($pos = 0; $pos <= ($current_length-$initBits); $pos++) { $key = substr($bit_sequences[$i], $pos, $initBits); $initE{$key}++; $sumInitE++; } for($pos = 0; $pos <= ($current_length-$transBits); $pos++) { $key = substr($bit_sequences[$i], $pos, $transBits); $transE{$key}++; $sumTransE++; } } # Convert occurrences to frequencies. foreach $key ( keys(%initE) ) { $initE{$key} /= $sumInitE; $initI{$key} /= $sumInitI; } $emptyProb = 0; $totalUses = 0; # Test the introns group. for( $i = $itest_start; $i < $itest_end; $i++ ) { $current_length = length($bit_sequences[$i]); $which = $i - $itest_start; $scoresI[$which] = 0; $key = substr( $bit_sequences[$i], 0, $initBits); if ( $initI{$key} != 0 ) { $probI = log($initI{$key}); $totalUses++; } else { $emptyProb++; } if ( $initE{$key} ) { $probE = log($initE{$key}); $totalUses++; } else { $emptyProb++; } for($pos = 0; $pos <= ($current_length-$transBits); $pos++ ) { $key = substr( $bit_sequences[$i], $pos, $transBits); $key0 = substr( $key, 0, $initBits ) . '0'; $key1 = substr( $key, 0, $initBits ) . '1'; if( $transI{$key0} != 0 && $transI{$key1} != 0 ) { $totalUses++; $probI += log($transI{$key} / ($transI{$key0} + $transI{$key1} )); } else { $emptyProb++; } if( $transE{$key0} != 0 && $transE{$key1} != 0 ) { $totalUses++; $probE += log($transE{$key} / ($transE{$key0} + $transE{$key1} )); } else { $emptyProb++; } } $scoresI[$which] = $probE - $probI; } # Test the exons group. for( $i = $etest_start; $i < $etest_end; $i++ ) { $current_length = length($bit_sequences[$i]); $which = $i - $etest_start; $scoresE[$which] = 0; $key = substr( $bit_sequences[$i], 0, $initBits); if ( $initI{$key} != 0 ) { $totalUses++; $probI = log($initI{$key}); } else { $emptyProb++; } if ( $initE{$key} ) { $totalUses++; $probE = log($initE{$key}); } else { $emptyProb++; } for($pos = 0; $pos <= ($current_length-$transBits); $pos++ ) { $key = substr( $bit_sequences[$i], $pos, $transBits); $key0 = substr( $key, 0, $initBits ) . '0'; $key1 = substr( $key, 0, $initBits ) . '1'; if( $transI{$key0} != 0 && $transI{$key1} != 0 ) { $totalUses++; $probI += log($transI{$key} / ($transI{$key0} + $transI{$key1} )); } else { $emptyProb++; } if( $transE{$key0} != 0 && $transE{$key1} != 0 ) { $totalUses++; $probE += log($transE{$key} / ($transE{$key0} + $transE{$key1} )); } else { $emptyProb++; } } $scoresE[$which] = $probE - $probI; } # Output the result (classification scores). # Print out the statistics for the particular abstraction rule used. #$exponent = log($totalUses+$emptyProb)/log(10); $possibleUses = $emptyProb + $totalUses; $knowledgeUsage = sprintf("%0.6f", ( $totalUses / $possibleUses ) ); print "$knowledgeUsage\t($emptyProb)\t$maps[$mapI].\n"; print OUT $maps[$mapI],"\n",join(' ', @scoresI),"\n",join(' ', @scoresE),"\n\n"; } close(OUT);