#!/usr/bin/perl -w

# Perl script: NTcomposition.pl
# Authors:  Jun Song, Trisha Dwyer, Sadeesh K. Ramakrishnan, Sasi Arunachalam,
#	Tom Wittenschlaeger, Jason M. Bechtel & Alexei Fedorov
# Bioinformatics Lab, Department of Medicine
# Medical College of Ohio, Toledo, Ohio, USA
# written April, 2007
# updated and expanded July, 2007 by Jason M. Bechtel
# Contact alexei.fedorov@utoledo.edu
#
# LICENSE:  GPL version 3 or later
# If you use this code or incorporate it in another program, please cite our paper:
# 	Bechtel JM, Wittenschlaeger T, Dwyer T, Song J, Arunachalam S, Ramakrishnan SK, Shepard S, Fedorov A: Genomic mid-range inhomogeneity correlates with an abundance of RNA secondary structures, BMC Genomics 2008, 9:284.
#
# This Perl script does the following:
#  - Section 1 creates all possible oligonucleotide strings up to $N-mers and initializes variables
#  - Section 2 reads the input file (in FASTA format), gets the nucleotide sequences,
#      and counts their oligonucleotide composition, keeping the counts in associative
#      arrays:  %nt1 for single nucleotides; %nt2 for dinucleotides; ... %ntN for N-mers
#  - Section 3 counts the total number of oligonucleotides for each associative array %ntN
#  - Section 4 calculates the frequency of each oligonucleotide in the associative array
#  - Section 5 prints the frequency of each oligonucleotide to the output file
#     o  This output is suitable for generating sequences using "prog_rand_seq".
#  - Section 6 prints the most and least common oligonucleotides to another output file
# This version of the script attempts to handle non-canonical positions (eg: X,N,R,Y) rationally.
# It identifies the overall canonical and non-canonical positions, splits up each sequence at any
# non-canonical positions/runs, and then analyses the resulting sub-sequences.  The overall
# canonical vs. non-canonical content is added to the main output file and a warning is written
# to STDERR.
#
# TODO:  Handle *all* non-canonical "letters" (eg: '.', 'n', etc.)
#

if (scalar(@ARGV) != 3) {
    print STDOUT "Usage:  NTcomposition.pl <infile.fa> <outfile> <oligo_max>\n";
    print STDOUT "\twhere <infile.fa> is the name of a file containing sequences to be analyzed (in FASTA format),\n";
    print STDOUT "\t<outfile> is the name of a file to hold the output,\n";
    print STDOUT "\tand <oligo_max> is the longest oligonucleotide size to be analyzed (1 to 9)\n";
    print STDOUT "\n";
    exit 1;
}

$infile = $ARGV[0]; # name of file containing sequences to analyze (in FASTA format)
$outfile = $ARGV[1]; # name of file to write resulting oligonucleotide frequencies to
$outfile2 = $ARGV[1] . ".tbl"; # name of file to write resulting oligonucleotide frequencies to
$N = int( $ARGV[2] ); # longest oligonucleotide size to appear in the output table
# Check that $N is an integer between 1 and 9 (10 and more take a lot of memory)
if ( $N != $ARGV[2] ) { die " > Error:  Invalid oligonucleotide length [$ARGV[2]]; <N> must be an integer (and 9 >= N >= 2)\n"; }
if ( $N < 1 || $N > 9 ) { die "Invalid oligonucleotide length [$ARGV[2]]; (9 >= N >= 2)\n"; }

# flush on every write
select(STDOUT); $| = 1;

# Section 1 creates all possible oligonucleotide strings up to $N-mers and initializes variables
# Define the alphabet
$alphabet = "ATCG";
print STDOUT "Initializing...";
@letters = split( '', $alphabet );
# Generate all possible n-mers in this alphabet...
@new = @letters;
for $n ( 1 .. $N ) {
    @{ 'NT' . $n } = @new;
    @new = ();
    foreach $x (@letters) {
        foreach $y ( @{ 'NT' . $n } ) {
            push( @new, ( $x . $y ) );
        }
    }
}
# Initialize variables
for $n ( 1 .. $N ) {
    ${ 'total' . $n } = 0; # total # of oligos counted for each oligo size
    %{ 'nt' . $n } = (); # oligo counts for a given oligo size
    %{ 'ntf' . $n } = (); # oligo frequencies for a given oligo size
    foreach $oligo (@{ 'NT' . $n }) {
        ${ 'nt' . $n }{ $oligo } = 0;
        ${ 'ntf' . $n }{ $oligo } = 0;
    }
}
$total_len = 0;
$total_noncanon = 0;
$overall_pct_noncanon = 0;
print STDOUT "  done\n";

# Section 2 reads the input file (in FASTA format), gets the nucleotide sequences,
# and counts their oligonucleotide composition, keeping the counts in associative
# arrays:  %nt1 for single nucleotides; %nt2 for dinucleotides; ... %ntN for N-mers
$/ = "\n>";
open( INFILE, "$infile" )  || die "Can't open \"$infile\":  $!\n";
open( OUTPUT, ">$outfile" ) || die "Can't open \"$outfile\":  $!\n";
print STDOUT "Reading in sequences from \"$infile\"...";
$num_recs_read = 0;
$num_recs_processed = 0;
while (<INFILE>) {
    $num_recs_read++;
    $seq = '';
    @lines = split( "\n", $_ );
    chomp( @lines ); chomp( @lines );
    if ($num_recs_read == 1) {
        if ($lines[0] !~ /^>/) { die " does not appear to be a FASTA-formatted file\n"; }
    }
    for $z ( 1 .. $#lines ) {
        # skip FASTA format comment lines
        if ( $lines[$z] =~ /^[>;]/ ) { next; }
        # convert to all-uppercase
        $lines[$z] = uc($lines[$z]);
        # strip out all non-letters
        # NOOOO!  There are particular accepted forms of punctuation in genomic sequences
        # and we want to characterize their presence.  --jmb, 7/17/07
        #$lines[$z] =~ s/[^A-Z]//g;
        # strip out all white-space
        $lines[$z] =~ s/\s//g;
        $seq .= $lines[$z];
    }
    $length = length($seq);
    $total_len += $length;
    @seqs = ();  # added this reinit. to avoid rescanning seqs on empty records ~ jmb, 4/7/09
    if ( $length > 0 ) {
        $full_length = length($seq);
        # remove non-canonical bases (uncertain positions, punctuation, etc.)
        @seqs = ( $seq );
        $seq =~ s/[^$alphabet]//g;
        # recalculate length
        $length = length($seq);
        $num_noncanon = $full_length - $length;
        $total_noncanon += $num_noncanon;
        #$pct_noncanon = $num_noncanon / $full_length * 100;
        if ( $num_noncanon > 0 ) {
            @seqs = split( /[^$alphabet]+/, $seqs[0] );
        }
    }
    # for each sub-sequence (if non-canonical characters caused the sequence to be split)
    foreach $seq ( @seqs ) {
        $length = length($seq);
        if ( $length > 0 ) {
            for $n ( 1 .. $N ) {
                if ($length >= $n) {
                    # count all oligos of length $n
                    for $x ( 0 .. ( $length - $n ) ) { # NOT "$length - $n + 1"! (7/16/07 --jmb)
                        # substr offsets start at zero (0)!
                        ${ 'nt' . $n }{ substr( $seq, $x, $n ) }++;
                    }
                }
            }
        }
    }
    $num_recs_processed++;
}
close (INFILE);
print STDOUT "  $num_recs_read records found\n";
if ($num_recs_read == 0) { die "No FASTA records in input file [\"$infile\"]!\nAborting...\n"; }
if ( $total_noncanon > 0 ) {
    $overall_pct_noncanon = $total_noncanon / $total_len * 100;
    printf STDERR " > Warning:  %d non-canonical positions found comprising %.2f%% of bulk sequence data\n\n", $total_noncanon, $overall_pct_noncanon;
}
print STDOUT "Calculating oligonucleotide frequencies...";

# Section 3 counts the total number of oligonucleotides for each associative array %ntN
for $n ( 1 .. $N ) {
    foreach $mer ( @{ 'NT' . $n } ) {
        ${ 'total' . $n } += ${ 'nt' . $n }{$mer};
    }
}

# Section 4 calculates the frequency of each oligonucleotide in the associative array
for $n ( 1 .. $N ) {
    $z = $n + 2; # decimal places of precision for this N-mer level
    if ( ${ 'total' . $n } == 0 ) {
        foreach $mer ( @{ 'NT' . $n } ) {
            ${ 'ntf' . $n }{$mer} =
              sprintf( "%.${z}f", 0 );
        }
    } else {
        foreach $mer ( @{ 'NT' . $n } ) {
            ${ 'ntf' . $n }{$mer} =
              sprintf( "%.${z}f", ${ 'nt' . $n }{$mer} / ${ 'total' . $n } );
        }
    }
}

print STDOUT "  done\n";
print STDOUT "Writing frequencies to \"$outfile\"...";

# Section 5 prints the frequency of each oligonucleotide to the output file
for $n ( 1 .. $N ) {
    @mers = @{ 'NT' . $n };
    foreach $mer ( 0 .. $#mers ) {
        print OUTPUT $mers[$mer], "\t", ${ 'ntf' . $n }{ $mers[$mer] }, "\t", ${ 'nt' . $n }{ $mers[$mer] }, "\n";
    }
    if ( $n == 1 && $total_noncanon > 0 ) {
        $total_canon = ( $total_len - $total_noncanon );
        $overall_pct_canon = $total_canon / $total_len * 100;
        $overall_pct_noncanon = $total_noncanon / $total_len * 100;
        #printf OUTPUT  "[%s]\t%.3f\t%d\n", $alphabet, $overall_pct_canon / 100, $total_canon;
        #printf OUTPUT "[^%s]\t%.3f\t%d\n", $alphabet, $overall_pct_noncanon / 100, $total_noncanon;
        printf OUTPUT  "[%s]\t%.3f\t%d\n", "N", $overall_pct_canon / 100, $total_canon;
        printf OUTPUT "[!%s]\t%.3f\t%d\n", "N", $overall_pct_noncanon / 100, $total_noncanon;
    }
    print OUTPUT "\n\n";
}
close (OUTPUT);

print STDOUT "  done\n";
print STDOUT "Writing most- and least-common n-mers table to \"$outfile2\"...";

# Section 6 prints the most and least common oligonucleotides to another output file
open( OUTPUT, ">$outfile2" ) || die "Can't open \"$outfile2\":  $!\n";
print OUTPUT "    Most- and Least-common N-mers\n\n";
print OUTPUT "\tmost\tleast\n";
for $n ( 1 .. $N ) {
    #%freqs = %{ 'ntf' . $n };
    # sort the hash of frequencies by value
    #@sorted = sort { $freqs{$a} <=> $freqs{$b} } keys %freqs;
    # sorting takes too much memory for 9-mers and up...  just traverse the hash
    %counts = %{ 'nt' . $n };
    $max_count = -1;
    @max_idxs = ();
    $min_count = -1;
    @min_idxs = ();
    while(($mer, $count) = each(%counts)) {
        # grab the highest- and lowest-count n-mers
        if ($count > $max_count || $max_count == -1) {
            @max_idxs = ();
            push(@max_idxs, $mer);
            $max_count = $count;
        }
        # track all oligos with max/min count
        elsif ($count == $max_count) {
            push(@max_idxs, $mer);
        }
        if ($count < $min_count || $min_count == -1) {
            @min_idxs = ();
            push(@min_idxs, $mer);
            $min_count = $count;
        }
        elsif ($count == $min_count) {
            push(@min_idxs, $mer);
        }
    }
    print OUTPUT "$n-mers";
    print OUTPUT "\t", $counts{ $max_idxs[0] };
    if (scalar(@max_idxs) >= 1) {
        print OUTPUT "/";
        print OUTPUT join(',', @max_idxs);
    }
    print OUTPUT "\t", $counts{ $min_idxs[0] };
    if (scalar(@min_idxs) >= 1) {
        print OUTPUT "/";
        print OUTPUT join(',', @min_idxs);
    }
    print OUTPUT "\n";
}
close (OUTPUT);
print STDOUT "  done\n";