#!/usr/bin/perl -w

# Perl script: MRI_analyzer.pl
#
# Authors:  Jun Song, Trisha Dwyer, Sadeesh K. Ramakrishnan, Sasi Arunachalam,
#	Tom Wittenschlaeger, Jason M. Bechtel & Alexei Fedorov
# Bioinformatics Lab, Department of Medicine
# Medical College of Ohio, Toledo, Ohio, USA
# written April, 2007
# updated and expanded July, 2007 by Jason M. Bechtel
# Contact afedorov@meduohio.edu
#
# LICENSE:  GPL version 3 or later
# If you use this code or incorporate it in another program, please cite our paper:
# 	Bechtel JM, Wittenschlaeger T, Dwyer T, Song J, Arunachalam S, Ramakrishnan SK, Shepard S, Fedorov A: Genomic mid-range inhomogeneity correlates with an abundance of RNA secondary structures, BMC Genomics 2008, 9:284.
#
# This Perl script does the following:
#  - scans a set of sequences (in FASTA format) for regions of inhomogeneity
#     o  using specified window size
#     o  using specified content type (GC/AT, AG/CT, A/CTG, etc.)
#     o  using specified thresholds
#  - produces a stream of data for both contrasting content types indicating
#      the presence/absence of regions of high-level content
#     o  eg: For GC-content, one sequence indicates the presence/absence of
#            GC-regions and a parallel sequence indicates the
#            presence/absence of AT-regions.
#     o  the sequences included on lines of suprathreshold content are those
#            that initially trip the threshold within that window; they will
#            be offset from the literal start of the window by some amount.
#     o  output is suitable for graphing using 'MRI_visualizer.pl'
#

# implement our own round() function
sub round {
    my($number) = shift;
    return int($number + .5 * ($number <=> 0));
}


if (scalar( @ARGV ) != 6) {
    print STDOUT "Usage:  MRI_analyzer <infile.fa> <outfile> <content> <winsize> <upper> <lower>\n";
    print STDOUT "\twhere <infile.fa> is the name of a file containing sequences to be analyzed (in FASTA format),\n";
    print STDOUT "\t<outfile> is the file to write the output to,\n";
    print STDOUT "\t<content> is a string of characters considered to be the primary content type\n";
    print STDOUT "\t\t(the complementary content is automatically derived from the known alphabet),\n";
    print STDOUT "\t<winsize> is the window size to use when scanning the sequence,\n";
    print STDOUT "\t<upper> is the upper threshold (in nucleotides) for justifying a region of primary content, and\n";
    print STDOUT "\t<lower> is the lower threshold (in nucleotides) for justifying a region of complementary content,\n";
    print STDOUT "\n";
    exit 1;
}

# define the alphabet we are working in (standardize on uppercase)
$alphabet = uc( "ATCG" );

$seqfile = $ARGV[0];
$outfile = $ARGV[1];
$content = uc( $ARGV[2] );
$content =~ s/[^$alphabet]//g;
$complement = $alphabet;
$complement =~ s/[$content]//g;
$maxchars = length( $alphabet ) - 1;
if ( length( $content ) < 1 || length( $content ) > $maxchars ) { die " > Error:  Invalid content specification [$ARGV[2]]; <content> must be a string of 1 to $maxchars characters from [$alphabet]\n"; }
# Check that <winsize> is an integer greater than or equal to 10
$winsize = int( $ARGV[3] );
if ( $winsize != $ARGV[3] ) { die " > Error:  Invalid oligonucleotide length [$ARGV[3]]; <winsize> must be an integer (and >= 10)\n"; }
if ( $winsize < 10 ) { die " > Error:  Invalid oligonucleotide length [$ARGV[3]]; <winsize> must be at least 10\n"; }
# Check that <upper> is an integer
$upper = int( $ARGV[4] );
if ( $upper != $ARGV[4] ) { die " > Error:  Invalid oligonucleotide length [$ARGV[4]]; <upper> must be an integer\n"; }
if ( $upper > $winsize ) { die " > Error:  Invalid oligonucleotide length [$ARGV[4]]; <upper> must be less than or equal to \$winsize ($winsize)\n"; }
# Check that <lower> is an integer
$lower = int( $ARGV[5] );
if ( $lower != $ARGV[5] ) { die " > Error:  Invalid oligonucleotide length [$ARGV[5]]; <lower> must be an integer\n"; }
if ( $lower < 0 ) { die " > Error:  Invalid oligonucleotide length [$ARGV[5]]; <lower> must be at least 0\n"; }
# Define the upper threshold to constitute a region of complementary content
$comp_upper = $winsize - $lower;

# a quick sanity check to make sure that there is a "neutral" area between the two thresholds
if ($upper - $lower <= 1) {
    print STDOUT " > Error:  The specified thresholds do not allow for a neutral space (upper = $upper; lower = $lower)!\n";
    print STDOUT " >> Please specify thresholds with a wider separation.\n";
    print STDOUT " >> Remember that they are specified in nucleotides.\n";
    print STDOUT "\n";
    exit 1;
}

# flush on every write
select(STDOUT); $| = 1;

$/ = "\n>";
open( OUTPUT, ">$outfile" ) || die "Can't open \"$outfile\": $!\n";
open( INFILE, "$seqfile" )  || die "Can't open \"$seqfile\": $!\n";
print STDOUT "Reading in sequences from \"$seqfile\"...";
$regions_pri    = 0; # number of primary content regions
$regions_comp   = 0; # number of complementary content regions
$global_pos     = 0; # position in the overall stream
%global_regions = (); # hash to hold region flags for each window position in the global string
%global_seqs    = (); # hash to hold sequence for each window position in the global string
$num_recs_read  = 0;
while (<INFILE>) {
    $num_recs_read++;
    @lines = split( /\n/, $_ );
    chomp( @lines ); chomp( @lines );
    $seq   = '';
    for $z ( 1 .. $#lines ) {
        # skip FASTA format comment lines
        if ( $lines[$z] =~ /^[>;]/ ) { next; }
        # convert to all-uppercase
        $lines[$z] = uc($lines[$z]);
        # strip out all white-space
        $lines[$z] =~ s/\s//g;
        $seq .= $lines[$z];
    }
    $len = length($seq);

    $in_region = 1; # flag whether we are currently in a region (1) or not (0)
                    # NOTE:  must default to '1' to read first window (and initialize)
    $local_pos = 0; # position in the current sequence/record
    while ( $local_pos < ( $len - $winsize ) ) {
        if ($in_region) {
            # We are in a region of primary or complementary content (or just starting a sequence)
            # grab the next window of letters
            $window     = substr( $seq, $local_pos, $winsize );
            @nt         = split( '', $window );
            $count_pri  = 0; # counter for primary content for the current window
            $count_comp = 0; # counter for complementary content for the current window
            foreach $x (@nt) {
                if ( $content =~ /$x/ ) { $count_pri++; }
                elsif ( $complement =~ /$x/ ) { $count_comp++; }
            }
            if ( $count_pri >= $upper ) {
                # upper threshold has been reached
                $in_region = 1;
                $local_pos += $winsize; # skip to next full window
                $global_pos += $winsize;
                $regions_pri++;
                $win_num = round ( $global_pos / $winsize );
                $global_regions{ $win_num } = 1;
                $global_seqs{ $win_num } = $window;
            } elsif ( $count_comp >= $comp_upper) {
                # lower threshold has been reached
                $in_region = 1;
                $local_pos += $winsize;
                $global_pos += $winsize;
                $regions_comp++;
                $win_num = round ( $global_pos / $winsize );
                $global_regions{ $win_num } = -1;
                $global_seqs{ $win_num } = $window;
            } else {
                # neither threshold has been reached
                $in_region = 0; $local_pos++; $global_pos++;
                $win_num = round ( $global_pos / $winsize );
                if ( !exists $global_regions{ $win_num } ) { $global_regions{ $win_num } = 0; }
            }
        } else {
            # We are *NOT* in a region of primary or complementary content;
            #   proceed carefully, but quickly...
            # slide one position along the sequence
            $old = shift(@nt); # dump out foremost position from window (and save it)
            $new = substr( $seq, ( $local_pos + $winsize ), 1 ); # get next position from source sequence
            push( @nt, $new ); # we now have our new window
            $window = join( '', @nt );
            if ( $content =~ /$old/ ) { $count_pri--; } # adjust the count based on the departing position
            if ( $content =~ /$new/ ) { $count_pri++; } # adjust the count based on the incoming position
            if ( $complement =~ /$old/ ) { $count_comp--; } # adjust the count based on the departing position
            if ( $complement =~ /$new/ ) { $count_comp++; } # adjust the count based on the incoming position
            if ( $count_pri >= $upper ) {
                # upper threshold has been reached
                $in_region = 1;
                $local_pos += $winsize;
                $global_pos += $winsize;
                $regions_pri++;
                $win_num = round ( $global_pos / $winsize );
                $global_regions{ $win_num } = 1;
                $global_seqs{ $win_num } = $window;
            } elsif ( $count_comp >= $comp_upper) {
                # lower threshold has been reached
                $in_region = 1;
                $local_pos += $winsize;
                $global_pos += $winsize;
                $regions_comp++;
                $win_num = round ( $global_pos / $winsize );
                $global_regions{ $win_num } = -1;
                $global_seqs{ $win_num } = $window;
            } else {
                # neither threshold has been reached
                $local_pos++; $global_pos++;
                $win_num = round ( $global_pos / $winsize );
                if ( !exists $global_regions{ $win_num } ) { $global_regions{ $win_num } = 0; }
            }
        }
    }
}
close(INFILE);
print STDOUT "  $num_recs_read records read\n";
printf STDERR "%32s %10d\n", "~ primary content regions:", $regions_pri;
printf STDERR "%32s %10d\n", "~ complementary content regions:", $regions_comp;
printf STDERR "%32s %10d\n", "~ potential regions:", scalar keys %global_regions;
printf STDERR "%32s %10d\n", "~ empty windows:", ( scalar ( keys (%global_regions) ) - ( $regions_pri + $regions_comp ) );

print OUTPUT "filename:", "\t", $seqfile, "\n";
print OUTPUT "content:", "\t", $content, "\n";
print OUTPUT "winsize:", "\t", $winsize, "\n";
print OUTPUT "upper_threshold:", "\t", $upper, "\n";
print OUTPUT "lower_threshold:", "\t", $lower, "\n";
print OUTPUT "primary_regions:", "\t", $regions_pri, "\n";
print OUTPUT "complementary_regions:", "\t", $regions_comp, "\n";
foreach $pos (sort { $a <=> $b } keys %global_regions) {
    $flag = $global_regions{$pos};
    if ($flag == 1) {
        $seq = $global_seqs{$pos};
        print OUTPUT $pos, "\t", $flag, "\t", "x", "\t", $seq, "\n";
    } elsif ($flag == -1) {
        $seq = $global_seqs{$pos};
        print OUTPUT $pos, "\t", "x", "\t", $flag, "\t", $seq, "\n";
    } else {
        print OUTPUT $pos, "\t", "x", "\t", "x", "\n";
    }
}
close(OUTPUT);