#!/usr/bin/perl -w # Perl script: SRI_generator.pl # # Authors: Jun Song, Trisha Dwyer, Sadeesh K. Ramakrishnan, Sasi Arunachalam, # Tom Wittenschlaeger, Jason M. Bechtel & Alexei Fedorov # Bioinformatics Lab, Department of Medicine # Medical College of Ohio, Toledo, Ohio, USA # written April, 2007 # updated and expanded July, 2007 by Jason M. Bechtel # Contact afedorov@meduohio.edu # # LICENSE: GPL version 3 or later # If you use this code or incorporate it in another program, please cite our paper: # Bechtel JM, Wittenschlaeger T, Dwyer T, Song J, Arunachalam S, Ramakrishnan SK, Shepard S, Fedorov A: Genomic mid-range inhomogeneity correlates with an abundance of RNA secondary structures, BMC Genomics 2008, 9:284. # # # This Perl script does the following: # - reads in the oligonucleotide composition tables (the output from SRI_analyzer.pl) # - generates all possible n-mers # - reads in a FASTA-formatted file of sequences and replaces them with randomized # sequences using the n-mer composition table # o chooses a starting oligo at random based on the n-mer composition table # o chooses next base at random based on the relative n-mer nucleotide composition # # eg: Using 4-mers, the program starts the replacement sequence with a 4-mer, chosen at # random but using the probability distribution specified in the composition table. # Say the first 4-mer is AATA. The script then enters a loop in which it considers # the last n-1 positions (in this case 'ATA') and builds a probability distribution # for all n-mers starting with that n-1-mer. In this case, the 4-mers would be # 'ATAA', 'ATAT', 'ATAC', and 'ATAG'. It then chooses one of these based on their # relative frequencies as indicated in the composition table. Say it chooses # 'ATAG'. Now the string is 'AATAG' and the last n-1 positions are 'TAG'. The # program now considers the n-mers 'TAGA', 'TAGT', 'TAGC', and 'TAGG' and continues # in this way until it has produced a sequence that is the same length as that in # the input file ($seqfile). # # Caveats: # Theoretically, this script could receive an input file that would lead to an # indeterminate state. This situation would be such that all productions stemming # from the last n-1 mer of a represented n-mer are not represented. By that I mean # that, for example, TTAG has a non-zero frequency in the table and could therefore # be chosen at random, but each production TAG* has a frequency of zero. Thus the # program is stuck and cannot proceed while being faithful to the frequencies # provided. In reality, this situation should rarely occur, but it depends on the # amount of sample sequence provided to the SRI_analyzer.pl program and the N-mer # level chosen for this program. In the case that this does occur (and it will # occur), this program has been written to choose the next position at random from # the alphabet. It will also issue a warning. # # TODO: Add option for generating an arbitrary length of semi-random sequence (without # reading from a file). # if (scalar(@ARGV) != 4) { print STDOUT "Usage: SRI_generator.pl \n"; print STDOUT "\twhere is the name of a file with the output from the SRI_analyzer.pl script,\n"; print STDOUT "\t is the name of a file containing sequences to be \"mocked\" (in FASTA format),\n"; print STDOUT "\t is the file to write the new pseudo-random replacement sequences to, and\n"; print STDOUT "\t is the size of oligonucleotides to use from the composition tables.\n"; print STDOUT "\n"; exit 1; } $NTcompfile = $ARGV[0]; $seqfile = $ARGV[1]; $outfile = $ARGV[2]; $N = int( $ARGV[3] ); # Check that $N is an integer greater than 2 if ( $N != $ARGV[3] ) { die " > Error: Invalid oligonucleotide length [$ARGV[3]]; must be an integer (and >= 2)\n"; } if ( $N < 2 ) { die " > Error: Invalid oligonucleotide length [$ARGV[3]]; must be at least 2\n"; } # define the alphabet we are working in $alphabet = "ATCG"; @letters = split( '', uc($alphabet) ); # flush on every write select(STDOUT); $| = 1; # read in the oligonucleotide composition tables from $NTcompfile (the output from SRI_analyzer.pl) %freq = (); open( COMP_TABLES, "$NTcompfile" ) || die "Can't open \"$NTcompfile\": $!\n"; print STDOUT "Reading in composition tables from \"$NTcompfile\"..."; $count = 0; $num_recs_read = 0; while () { chomp; chomp; $count++; # expected syntax if ( $_ =~ /^([$alphabet]+)\s+([0-9\.]+)/i ) { $num_recs_read++; # we only care about N-mers if ( $_ =~ /^([$alphabet]{$N})\s+([0-9\.]+)/i ) { $freq{uc($1)} = $2; } } # ignore empty lines elsif ( $_ =~ /^$/ ) { next; } # ignore entries for character sets elsif ( $_ =~ /^[\[A-Z]/ ) { next; } # bad format else { print STDERR "[\"$NTcompfile\"] > Warning: bad format on line $count: [$_]\n"; } } close( COMP_TABLES ); printf STDOUT " %d entries found; (%d for %s-mers)\n", $num_recs_read, scalar keys( %freq ), $N; if ( scalar keys( %freq ) == 0 ) { die " > Error: no NT composition data for $N-mers.\n >> Check the SRI_analyzer.pl output file [\"$NTcompfile\"].\n" } print STDOUT "Initializing..."; # generate all possible n-mers by building up from monomers @new = @letters; for $n ( 1 .. $N ) { @{ 'NT' . $n } = @new; @new = (); foreach $x (@letters) { foreach $y ( @{ 'NT' . $n } ) { push( @new, ( $x . $y ) ); } } } # we only care about N-mers @nmers = @{ 'NT' . $N }; for $n ( 1 .. $N ) { undef( @{ 'NT' . $n }); } $sum_n = 0; # accumulator for probabilities @sums = (); # hold probability demarcations for oligos so we can pick the starting oligo $factor = 10**($N+2); # factor to eliminate decimal places (eg: 4-mers + 2 => 6 zeros) for $x ( 0 .. $#nmers ) { if (!exists($freq{ $nmers[$x] })) { die " > Error: incomplete NT composition for $N-mers.\n >> Check the SRI_analyzer.pl output file [\"$NTcompfile\"].\n" } $freq{ $nmers[$x] } *= $factor; $sum_n += $freq{ $nmers[$x] }; push( @sums, $sum_n ); } print STDOUT " done\n"; $/ = "\n>"; open( INFILE, "$seqfile" ) || die "Can't open \"$seqfile\": $!\n"; open( OUTPUT, ">$outfile" ) || die "Can't open \"$outfile\": $!\n"; print STDOUT "Processing sequences from \"$seqfile\"..."; $num_recs_read = 0; $num_recs_processed = 0; while () { $num_recs_read++; $seq = ''; # original sequence $string = ''; # generated "random" string $len = 0; # length of the original sequence and of the string to be generated @lines = split( /\n/, $_ ); chomp( @lines ); chomp( @lines ); if ($num_recs_read == 1) { if (!$lines[0] =~ /^>/) { die " does not appear to be a FASTA-formatted file\n"; } } for $z ( 1 .. $#lines ) { # skip FASTA format comment lines if ( $lines[$z] =~ /^[>;]/ ) { next; } # convert to all-uppercase $lines[$z] = uc($lines[$z]); # strip out all non-letters # NOOOO! There are particular accepted forms of punctuation in genomic sequences # and we want to replace them with random sequence. --jmb, 7/17/2007 # UPDATE: ... we want to *maintain* them in the random sequence (see below). --jmb, 10/4/2007 #$lines[$z] =~ s/[^A-Z]//g; # strip out all white-space $lines[$z] =~ s/\s//g; $seq .= $lines[$z]; } $len = length($seq); if ( $len < $N ) { print STDERR "\n > Warning: skipping short sequence: [$seq] (record #$num_recs_read)!\n"; next; } # choose starting oligo at random, based on the oligonucleotide composition $r = rand($sum_n); for $x ( 0 .. $#nmers ) { if ( $sums[$x] > $r ) { $string = $nmers[$x]; last; } } for ( 1 .. ( $len - $N ) ) { # get last N-1-mer from generated string ("tail") $tail = substr( $string, -($N-1) ); $accum = 0; # accumulator for probabilities @demarc = (); # probability demarcations for oligos starting with $tail @next = (); # list of N-mer oligos that start with $tail for $x ( 0 .. $#letters ) { push( @next, $tail . $letters[$x] ); } # build demarcation list for these oligos for $x ( 0 .. $#letters ) { $accum += $freq{ $next[$x] }; push( @demarc, $accum ); } # sanity check to make sure that we don't have all zero probabilities... if ( $demarc[$#letters] == 0 ) { # choose one at random print STDERR " > Warning: stuck in zero-frequency hole: [$tail]!\n"; $r2 = rand( $N-1 ); $new = $letters[$r2]; print STDERR " >> choosing next position at random and continuing... [$new]!\n"; } else { # choose one at random based on the relative N-mer nucleotide composition $r2 = rand( $demarc[$#letters] + 1 ); for $x ( 0 .. $#letters ) { if ( $demarc[$x] > $r2 ) { $new = $letters[$x]; last; } } } $string .= $new; } # if there are any non-canonical bases (N, X, '.', etc.) in the original sequence, # transfer them into the new generated string to preserve that lack of knowledge. --jmb, 10/4/2007 if ( ( $seq =~ /[^$alphabet]/ ) ) { @seqchars = split( '', $seq ); @newchars = split( '', $string ); for $pos ( 0 .. $#seqchars ) { $char = $seqchars[$pos]; if ( !( $char =~ /[$alphabet]/ ) ) { $newchars[$pos] = $char; } } $string = join( '', @newchars ); } $num_recs_processed++; # write the resulting semi-random string to the output file print OUTPUT "> entry $num_recs_processed\n$string\n\n"; } print STDOUT " $num_recs_processed records processed\n";