#!/usr/bin/perl -w

# Perl script:  SRI_generator.pl
#
# Authors:  Jun Song, Trisha Dwyer, Sadeesh K. Ramakrishnan, Sasi Arunachalam,
#	Tom Wittenschlaeger, Jason M. Bechtel & Alexei Fedorov
# Bioinformatics Lab, Department of Medicine
# Medical College of Ohio, Toledo, Ohio, USA
# written April, 2007
# updated and expanded July, 2007 by Jason M. Bechtel
# Contact afedorov@meduohio.edu
#
# LICENSE:  GPL version 3 or later
# If you use this code or incorporate it in another program, please cite our paper:
# 	Bechtel JM, Wittenschlaeger T, Dwyer T, Song J, Arunachalam S, Ramakrishnan SK, Shepard S, Fedorov A: Genomic mid-range inhomogeneity correlates with an abundance of RNA secondary structures, BMC Genomics 2008, 9:284.
#
#
# This Perl script does the following:
#  - reads in the oligonucleotide composition tables (the output from SRI_analyzer.pl)
#  - generates all possible n-mers
#  - reads in a FASTA-formatted file of sequences and replaces them with randomized
#      sequences using the n-mer composition table
#     o  chooses a starting oligo at random based on the n-mer composition table
#     o  chooses next base at random based on the relative n-mer nucleotide composition
#
# eg:  Using 4-mers, the program starts the replacement sequence with a 4-mer, chosen at
#      random but using the probability distribution specified in the composition table.
#      Say the first 4-mer is AATA.  The script then enters a loop in which it considers
#      the last n-1 positions (in this case 'ATA') and builds a probability distribution
#      for all n-mers starting with that n-1-mer.  In this case, the 4-mers would be
#      'ATAA', 'ATAT', 'ATAC', and 'ATAG'.  It then chooses one of these based on their
#      relative frequencies as indicated in the composition table.  Say it chooses
#      'ATAG'.  Now the string is 'AATAG' and the last n-1 positions are 'TAG'.  The
#      program now considers the n-mers 'TAGA', 'TAGT', 'TAGC', and 'TAGG' and continues
#      in this way until it has produced a sequence that is the same length as that in
#      the input file ($seqfile).
#
# Caveats:
#      Theoretically, this script could receive an input file that would lead to an
#      indeterminate state.  This situation would be such that all productions stemming
#      from the last n-1 mer of a represented n-mer are not represented.  By that I mean
#      that, for example, TTAG has a non-zero frequency in the table and could therefore
#      be chosen at random, but each production TAG* has a frequency of zero.  Thus the
#      program is stuck and cannot proceed while being faithful to the frequencies
#      provided.  In reality, this situation should rarely occur, but it depends on the
#      amount of sample sequence provided to the SRI_analyzer.pl program and the N-mer
#      level chosen for this program.  In the case that this does occur (and it will
#      occur), this program has been written to choose the next position at random from
#      the alphabet.  It will also issue a warning.
#
# TODO:  Add option for generating an arbitrary length of semi-random sequence (without
#        reading from a file).
#

if (scalar(@ARGV) != 4) {
    print STDOUT "Usage:  SRI_generator.pl <NTcompositionfile> <infile.fa> <outfile> <N>\n";
    print STDOUT "\twhere <NTcompositionfile> is the name of a file with the output from the SRI_analyzer.pl script,\n";
    print STDOUT "\t<infile.fa> is the name of a file containing sequences to be \"mocked\" (in FASTA format),\n";
    print STDOUT "\t<outfile> is the file to write the new pseudo-random replacement sequences to, and\n";
    print STDOUT "\t<N> is the size of oligonucleotides to use from the composition tables.\n";
    print STDOUT "\n";
    exit 1;
}

$NTcompfile = $ARGV[0];
$seqfile    = $ARGV[1];
$outfile    = $ARGV[2];
$N          = int( $ARGV[3] );
# Check that $N is an integer greater than 2
if ( $N != $ARGV[3] ) { die " > Error:  Invalid oligonucleotide length [$ARGV[3]]; <N> must be an integer (and >= 2)\n"; }
if ( $N < 2 ) { die " > Error:  Invalid oligonucleotide length [$ARGV[3]]; <N> must be at least 2\n"; }

# define the alphabet we are working in
$alphabet = "ATCG";
@letters  = split( '', uc($alphabet) );

# flush on every write
select(STDOUT); $| = 1;

# read in the oligonucleotide composition tables from $NTcompfile (the output from SRI_analyzer.pl)
%freq = ();
open( COMP_TABLES, "$NTcompfile" ) || die "Can't open \"$NTcompfile\": $!\n";
print STDOUT "Reading in composition tables from \"$NTcompfile\"...";
$count = 0;
$num_recs_read = 0;
while (<COMP_TABLES>) {
    chomp; chomp;
    $count++;
    # expected syntax
    if ( $_ =~ /^([$alphabet]+)\s+([0-9\.]+)/i ) {
        $num_recs_read++;
        # we only care about N-mers
        if ( $_ =~ /^([$alphabet]{$N})\s+([0-9\.]+)/i ) {
            $freq{uc($1)} = $2;
        }
    }
    # ignore empty lines
    elsif ( $_ =~ /^$/ ) { next; }
    # ignore entries for character sets
    elsif ( $_ =~ /^[\[A-Z]/ ) { next; }
    # bad format
    else { print STDERR "[\"$NTcompfile\"] > Warning: bad format on line $count: [$_]\n"; }
}
close( COMP_TABLES );
printf STDOUT "  %d entries found; (%d for %s-mers)\n", $num_recs_read, scalar keys( %freq ), $N;
if ( scalar keys( %freq ) == 0 ) { die " > Error:  no NT composition data for $N-mers.\n >> Check the SRI_analyzer.pl output file [\"$NTcompfile\"].\n" }

print STDOUT "Initializing...";
# generate all possible n-mers by building up from monomers
@new = @letters;
for $n ( 1 .. $N ) {
    @{ 'NT' . $n } = @new;
    @new = ();
    foreach $x (@letters) {
        foreach $y ( @{ 'NT' . $n } ) {
            push( @new, ( $x . $y ) );
        }
    }
}
# we only care about N-mers
@nmers = @{ 'NT' . $N };
for $n ( 1 .. $N ) { undef( @{ 'NT' . $n }); }

$sum_n = 0;  # accumulator for probabilities
@sums = ();  # hold probability demarcations for oligos so we can pick the starting oligo
$factor = 10**($N+2); # factor to eliminate decimal places (eg:  4-mers + 2 => 6 zeros)
for $x ( 0 .. $#nmers ) {
    if (!exists($freq{ $nmers[$x] })) { die " > Error:  incomplete NT composition for $N-mers.\n >> Check the SRI_analyzer.pl output file [\"$NTcompfile\"].\n" }
    $freq{ $nmers[$x] } *= $factor;
    $sum_n += $freq{ $nmers[$x] };
    push( @sums, $sum_n );
}
print STDOUT "  done\n";

$/ = "\n>";
open( INFILE, "$seqfile" )  || die "Can't open \"$seqfile\": $!\n";
open( OUTPUT, ">$outfile" ) || die "Can't open \"$outfile\": $!\n";
print STDOUT "Processing sequences from \"$seqfile\"...";
$num_recs_read = 0;
$num_recs_processed = 0;
while (<INFILE>) {
    $num_recs_read++;
    $seq    = '';    # original sequence
    $string = '';    # generated "random" string
    $len    = 0;     # length of the original sequence and of the string to be generated
    @lines = split( /\n/, $_ );
    chomp( @lines ); chomp( @lines );
    if ($num_recs_read == 1) {
        if (!$lines[0] =~ /^>/) { die " does not appear to be a FASTA-formatted file\n"; }
    }
    for $z ( 1 .. $#lines ) {
        # skip FASTA format comment lines
        if ( $lines[$z] =~ /^[>;]/ ) { next; }
        # convert to all-uppercase
        $lines[$z] = uc($lines[$z]);
        # strip out all non-letters
        # NOOOO!  There are particular accepted forms of punctuation in genomic sequences
        # and we want to replace them with random sequence.  --jmb, 7/17/2007
        # UPDATE: ... we want to *maintain* them in the random sequence (see below).  --jmb, 10/4/2007
        #$lines[$z] =~ s/[^A-Z]//g;
        # strip out all white-space
        $lines[$z] =~ s/\s//g;
        $seq .= $lines[$z];
    }
    $len = length($seq);
    if ( $len < $N ) {
        print STDERR "\n > Warning:  skipping short sequence: [$seq] (record #$num_recs_read)!\n";
        next;
    }

    # choose starting oligo at random, based on the oligonucleotide composition
    $r = rand($sum_n);
    for $x ( 0 .. $#nmers ) {
        if ( $sums[$x] > $r ) { $string = $nmers[$x]; last; }
    }

    for ( 1 .. ( $len - $N ) ) {
        # get last N-1-mer from generated string ("tail")
        $tail = substr( $string, -($N-1) );
        $accum = 0;    # accumulator for probabilities
        @demarc = ();  # probability demarcations for oligos starting with $tail
        @next = ();    # list of N-mer oligos that start with $tail
        for $x ( 0 .. $#letters ) { push( @next, $tail . $letters[$x] ); }

        # build demarcation list for these oligos
        for $x ( 0 .. $#letters ) {
            $accum += $freq{ $next[$x] };
            push( @demarc, $accum );
        }

        # sanity check to make sure that we don't have all zero probabilities...
        if ( $demarc[$#letters] == 0 ) {
            # choose one at random
            print STDERR " > Warning:  stuck in zero-frequency hole: [$tail]!\n";
            $r2 = rand( $N-1 );
            $new = $letters[$r2];
            print STDERR " >> choosing next position at random and continuing... [$new]!\n";
        } else {
            # choose one at random based on the relative N-mer nucleotide composition
            $r2 = rand( $demarc[$#letters] + 1 );
            for $x ( 0 .. $#letters ) {
                if ( $demarc[$x] > $r2 ) { $new = $letters[$x]; last; }
            }
        }
        $string .= $new;
    }

    # if there are any non-canonical bases (N, X, '.', etc.) in the original sequence,
    # transfer them into the new generated string to preserve that lack of knowledge. --jmb, 10/4/2007
    if ( ( $seq =~ /[^$alphabet]/ ) ) {
        @seqchars = split( '', $seq );
        @newchars = split( '', $string );
        for $pos ( 0 .. $#seqchars ) {
            $char = $seqchars[$pos];
            if ( !( $char =~ /[$alphabet]/ ) ) {
                $newchars[$pos] = $char;
            }
        }
        $string = join( '', @newchars );
    }

    $num_recs_processed++;
    # write the resulting semi-random string to the output file
    print OUTPUT "> entry $num_recs_processed\n$string\n\n";
}
print STDOUT "  $num_recs_processed records processed\n";