#!/usr/bin/perl -w # Perl script: MRI_analyzer.pl # # Authors: Jun Song, Trisha Dwyer, Sadeesh K. Ramakrishnan, Sasi Arunachalam, # Tom Wittenschlaeger, Jason M. Bechtel & Alexei Fedorov # Bioinformatics Lab, Department of Medicine # Medical College of Ohio, Toledo, Ohio, USA # written April, 2007 # updated and expanded July, 2007 by Jason M. Bechtel # Contact afedorov@meduohio.edu # # LICENSE: GPL version 3 or later # If you use this code or incorporate it in another program, please cite our paper: # Bechtel JM, Wittenschlaeger T, Dwyer T, Song J, Arunachalam S, Ramakrishnan SK, Shepard S, Fedorov A: Genomic mid-range inhomogeneity correlates with an abundance of RNA secondary structures, BMC Genomics 2008, 9:284. # # This Perl script does the following: # - scans a set of sequences (in FASTA format) for regions of inhomogeneity # o using specified window size # o using specified content type (GC/AT, AG/CT, A/CTG, etc.) # o using specified thresholds # - produces a stream of data for both contrasting content types indicating # the presence/absence of regions of high-level content # o eg: For GC-content, one sequence indicates the presence/absence of # GC-regions and a parallel sequence indicates the # presence/absence of AT-regions. # o the sequences included on lines of suprathreshold content are those # that initially trip the threshold within that window; they will # be offset from the literal start of the window by some amount. # o output is suitable for graphing using 'MRI_visualizer.pl' # # implement our own round() function sub round { my($number) = shift; return int($number + .5 * ($number <=> 0)); } if (scalar( @ARGV ) != 6) { print STDOUT "Usage: MRI_analyzer \n"; print STDOUT "\twhere is the name of a file containing sequences to be analyzed (in FASTA format),\n"; print STDOUT "\t is the file to write the output to,\n"; print STDOUT "\t is a string of characters considered to be the primary content type\n"; print STDOUT "\t\t(the complementary content is automatically derived from the known alphabet),\n"; print STDOUT "\t is the window size to use when scanning the sequence,\n"; print STDOUT "\t is the upper threshold (in nucleotides) for justifying a region of primary content, and\n"; print STDOUT "\t is the lower threshold (in nucleotides) for justifying a region of complementary content,\n"; print STDOUT "\n"; exit 1; } # define the alphabet we are working in (standardize on uppercase) $alphabet = uc( "ATCG" ); $seqfile = $ARGV[0]; $outfile = $ARGV[1]; $content = uc( $ARGV[2] ); $content =~ s/[^$alphabet]//g; $complement = $alphabet; $complement =~ s/[$content]//g; $maxchars = length( $alphabet ) - 1; if ( length( $content ) < 1 || length( $content ) > $maxchars ) { die " > Error: Invalid content specification [$ARGV[2]]; must be a string of 1 to $maxchars characters from [$alphabet]\n"; } # Check that is an integer greater than or equal to 10 $winsize = int( $ARGV[3] ); if ( $winsize != $ARGV[3] ) { die " > Error: Invalid oligonucleotide length [$ARGV[3]]; must be an integer (and >= 10)\n"; } if ( $winsize < 10 ) { die " > Error: Invalid oligonucleotide length [$ARGV[3]]; must be at least 10\n"; } # Check that is an integer $upper = int( $ARGV[4] ); if ( $upper != $ARGV[4] ) { die " > Error: Invalid oligonucleotide length [$ARGV[4]]; must be an integer\n"; } if ( $upper > $winsize ) { die " > Error: Invalid oligonucleotide length [$ARGV[4]]; must be less than or equal to \$winsize ($winsize)\n"; } # Check that is an integer $lower = int( $ARGV[5] ); if ( $lower != $ARGV[5] ) { die " > Error: Invalid oligonucleotide length [$ARGV[5]]; must be an integer\n"; } if ( $lower < 0 ) { die " > Error: Invalid oligonucleotide length [$ARGV[5]]; must be at least 0\n"; } # Define the upper threshold to constitute a region of complementary content $comp_upper = $winsize - $lower; # a quick sanity check to make sure that there is a "neutral" area between the two thresholds if ($upper - $lower <= 1) { print STDOUT " > Error: The specified thresholds do not allow for a neutral space (upper = $upper; lower = $lower)!\n"; print STDOUT " >> Please specify thresholds with a wider separation.\n"; print STDOUT " >> Remember that they are specified in nucleotides.\n"; print STDOUT "\n"; exit 1; } # flush on every write select(STDOUT); $| = 1; $/ = "\n>"; open( OUTPUT, ">$outfile" ) || die "Can't open \"$outfile\": $!\n"; open( INFILE, "$seqfile" ) || die "Can't open \"$seqfile\": $!\n"; print STDOUT "Reading in sequences from \"$seqfile\"..."; $regions_pri = 0; # number of primary content regions $regions_comp = 0; # number of complementary content regions $global_pos = 0; # position in the overall stream %global_regions = (); # hash to hold region flags for each window position in the global string %global_seqs = (); # hash to hold sequence for each window position in the global string $num_recs_read = 0; while () { $num_recs_read++; @lines = split( /\n/, $_ ); chomp( @lines ); chomp( @lines ); $seq = ''; for $z ( 1 .. $#lines ) { # skip FASTA format comment lines if ( $lines[$z] =~ /^[>;]/ ) { next; } # convert to all-uppercase $lines[$z] = uc($lines[$z]); # strip out all white-space $lines[$z] =~ s/\s//g; $seq .= $lines[$z]; } $len = length($seq); $in_region = 1; # flag whether we are currently in a region (1) or not (0) # NOTE: must default to '1' to read first window (and initialize) $local_pos = 0; # position in the current sequence/record while ( $local_pos < ( $len - $winsize ) ) { if ($in_region) { # We are in a region of primary or complementary content (or just starting a sequence) # grab the next window of letters $window = substr( $seq, $local_pos, $winsize ); @nt = split( '', $window ); $count_pri = 0; # counter for primary content for the current window $count_comp = 0; # counter for complementary content for the current window foreach $x (@nt) { if ( $content =~ /$x/ ) { $count_pri++; } elsif ( $complement =~ /$x/ ) { $count_comp++; } } if ( $count_pri >= $upper ) { # upper threshold has been reached $in_region = 1; $local_pos += $winsize; # skip to next full window $global_pos += $winsize; $regions_pri++; $win_num = round ( $global_pos / $winsize ); $global_regions{ $win_num } = 1; $global_seqs{ $win_num } = $window; } elsif ( $count_comp >= $comp_upper) { # lower threshold has been reached $in_region = 1; $local_pos += $winsize; $global_pos += $winsize; $regions_comp++; $win_num = round ( $global_pos / $winsize ); $global_regions{ $win_num } = -1; $global_seqs{ $win_num } = $window; } else { # neither threshold has been reached $in_region = 0; $local_pos++; $global_pos++; $win_num = round ( $global_pos / $winsize ); if ( !exists $global_regions{ $win_num } ) { $global_regions{ $win_num } = 0; } } } else { # We are *NOT* in a region of primary or complementary content; # proceed carefully, but quickly... # slide one position along the sequence $old = shift(@nt); # dump out foremost position from window (and save it) $new = substr( $seq, ( $local_pos + $winsize ), 1 ); # get next position from source sequence push( @nt, $new ); # we now have our new window $window = join( '', @nt ); if ( $content =~ /$old/ ) { $count_pri--; } # adjust the count based on the departing position if ( $content =~ /$new/ ) { $count_pri++; } # adjust the count based on the incoming position if ( $complement =~ /$old/ ) { $count_comp--; } # adjust the count based on the departing position if ( $complement =~ /$new/ ) { $count_comp++; } # adjust the count based on the incoming position if ( $count_pri >= $upper ) { # upper threshold has been reached $in_region = 1; $local_pos += $winsize; $global_pos += $winsize; $regions_pri++; $win_num = round ( $global_pos / $winsize ); $global_regions{ $win_num } = 1; $global_seqs{ $win_num } = $window; } elsif ( $count_comp >= $comp_upper) { # lower threshold has been reached $in_region = 1; $local_pos += $winsize; $global_pos += $winsize; $regions_comp++; $win_num = round ( $global_pos / $winsize ); $global_regions{ $win_num } = -1; $global_seqs{ $win_num } = $window; } else { # neither threshold has been reached $local_pos++; $global_pos++; $win_num = round ( $global_pos / $winsize ); if ( !exists $global_regions{ $win_num } ) { $global_regions{ $win_num } = 0; } } } } } close(INFILE); print STDOUT " $num_recs_read records read\n"; printf STDERR "%32s %10d\n", "~ primary content regions:", $regions_pri; printf STDERR "%32s %10d\n", "~ complementary content regions:", $regions_comp; printf STDERR "%32s %10d\n", "~ potential regions:", scalar keys %global_regions; printf STDERR "%32s %10d\n", "~ empty windows:", ( scalar ( keys (%global_regions) ) - ( $regions_pri + $regions_comp ) ); print OUTPUT "filename:", "\t", $seqfile, "\n"; print OUTPUT "content:", "\t", $content, "\n"; print OUTPUT "winsize:", "\t", $winsize, "\n"; print OUTPUT "upper_threshold:", "\t", $upper, "\n"; print OUTPUT "lower_threshold:", "\t", $lower, "\n"; print OUTPUT "primary_regions:", "\t", $regions_pri, "\n"; print OUTPUT "complementary_regions:", "\t", $regions_comp, "\n"; foreach $pos (sort { $a <=> $b } keys %global_regions) { $flag = $global_regions{$pos}; if ($flag == 1) { $seq = $global_seqs{$pos}; print OUTPUT $pos, "\t", $flag, "\t", "x", "\t", $seq, "\n"; } elsif ($flag == -1) { $seq = $global_seqs{$pos}; print OUTPUT $pos, "\t", "x", "\t", $flag, "\t", $seq, "\n"; } else { print OUTPUT $pos, "\t", "x", "\t", "x", "\n"; } } close(OUTPUT);