#!/usr/bin/perl -w # Perl script: NTcomposition.pl # Authors: Jun Song, Trisha Dwyer, Sadeesh K. Ramakrishnan, Sasi Arunachalam, # Tom Wittenschlaeger, Jason M. Bechtel & Alexei Fedorov # Bioinformatics Lab, Department of Medicine # Medical College of Ohio, Toledo, Ohio, USA # written April, 2007 # updated and expanded July, 2007 by Jason M. Bechtel # Contact alexei.fedorov@utoledo.edu # # LICENSE: GPL version 3 or later # If you use this code or incorporate it in another program, please cite our paper: # Bechtel JM, Wittenschlaeger T, Dwyer T, Song J, Arunachalam S, Ramakrishnan SK, Shepard S, Fedorov A: Genomic mid-range inhomogeneity correlates with an abundance of RNA secondary structures, BMC Genomics 2008, 9:284. # # This Perl script does the following: # - Section 1 creates all possible oligonucleotide strings up to $N-mers and initializes variables # - Section 2 reads the input file (in FASTA format), gets the nucleotide sequences, # and counts their oligonucleotide composition, keeping the counts in associative # arrays: %nt1 for single nucleotides; %nt2 for dinucleotides; ... %ntN for N-mers # - Section 3 counts the total number of oligonucleotides for each associative array %ntN # - Section 4 calculates the frequency of each oligonucleotide in the associative array # - Section 5 prints the frequency of each oligonucleotide to the output file # o This output is suitable for generating sequences using "prog_rand_seq". # - Section 6 prints the most and least common oligonucleotides to another output file # This version of the script attempts to handle non-canonical positions (eg: X,N,R,Y) rationally. # It identifies the overall canonical and non-canonical positions, splits up each sequence at any # non-canonical positions/runs, and then analyses the resulting sub-sequences. The overall # canonical vs. non-canonical content is added to the main output file and a warning is written # to STDERR. # # TODO: Handle *all* non-canonical "letters" (eg: '.', 'n', etc.) # if (scalar(@ARGV) != 3) { print STDOUT "Usage: NTcomposition.pl \n"; print STDOUT "\twhere is the name of a file containing sequences to be analyzed (in FASTA format),\n"; print STDOUT "\t is the name of a file to hold the output,\n"; print STDOUT "\tand is the longest oligonucleotide size to be analyzed (1 to 9)\n"; print STDOUT "\n"; exit 1; } $infile = $ARGV[0]; # name of file containing sequences to analyze (in FASTA format) $outfile = $ARGV[1]; # name of file to write resulting oligonucleotide frequencies to $outfile2 = $ARGV[1] . ".tbl"; # name of file to write resulting oligonucleotide frequencies to $N = int( $ARGV[2] ); # longest oligonucleotide size to appear in the output table # Check that $N is an integer between 1 and 9 (10 and more take a lot of memory) if ( $N != $ARGV[2] ) { die " > Error: Invalid oligonucleotide length [$ARGV[2]]; must be an integer (and 9 >= N >= 2)\n"; } if ( $N < 1 || $N > 9 ) { die "Invalid oligonucleotide length [$ARGV[2]]; (9 >= N >= 2)\n"; } # flush on every write select(STDOUT); $| = 1; # Section 1 creates all possible oligonucleotide strings up to $N-mers and initializes variables # Define the alphabet $alphabet = "ATCG"; print STDOUT "Initializing..."; @letters = split( '', $alphabet ); # Generate all possible n-mers in this alphabet... @new = @letters; for $n ( 1 .. $N ) { @{ 'NT' . $n } = @new; @new = (); foreach $x (@letters) { foreach $y ( @{ 'NT' . $n } ) { push( @new, ( $x . $y ) ); } } } # Initialize variables for $n ( 1 .. $N ) { ${ 'total' . $n } = 0; # total # of oligos counted for each oligo size %{ 'nt' . $n } = (); # oligo counts for a given oligo size %{ 'ntf' . $n } = (); # oligo frequencies for a given oligo size foreach $oligo (@{ 'NT' . $n }) { ${ 'nt' . $n }{ $oligo } = 0; ${ 'ntf' . $n }{ $oligo } = 0; } } $total_len = 0; $total_noncanon = 0; $overall_pct_noncanon = 0; print STDOUT " done\n"; # Section 2 reads the input file (in FASTA format), gets the nucleotide sequences, # and counts their oligonucleotide composition, keeping the counts in associative # arrays: %nt1 for single nucleotides; %nt2 for dinucleotides; ... %ntN for N-mers $/ = "\n>"; open( INFILE, "$infile" ) || die "Can't open \"$infile\": $!\n"; open( OUTPUT, ">$outfile" ) || die "Can't open \"$outfile\": $!\n"; print STDOUT "Reading in sequences from \"$infile\"..."; $num_recs_read = 0; $num_recs_processed = 0; while () { $num_recs_read++; $seq = ''; @lines = split( "\n", $_ ); chomp( @lines ); chomp( @lines ); if ($num_recs_read == 1) { if ($lines[0] !~ /^>/) { die " does not appear to be a FASTA-formatted file\n"; } } for $z ( 1 .. $#lines ) { # skip FASTA format comment lines if ( $lines[$z] =~ /^[>;]/ ) { next; } # convert to all-uppercase $lines[$z] = uc($lines[$z]); # strip out all non-letters # NOOOO! There are particular accepted forms of punctuation in genomic sequences # and we want to characterize their presence. --jmb, 7/17/07 #$lines[$z] =~ s/[^A-Z]//g; # strip out all white-space $lines[$z] =~ s/\s//g; $seq .= $lines[$z]; } $length = length($seq); $total_len += $length; @seqs = (); # added this reinit. to avoid rescanning seqs on empty records ~ jmb, 4/7/09 if ( $length > 0 ) { $full_length = length($seq); # remove non-canonical bases (uncertain positions, punctuation, etc.) @seqs = ( $seq ); $seq =~ s/[^$alphabet]//g; # recalculate length $length = length($seq); $num_noncanon = $full_length - $length; $total_noncanon += $num_noncanon; #$pct_noncanon = $num_noncanon / $full_length * 100; if ( $num_noncanon > 0 ) { @seqs = split( /[^$alphabet]+/, $seqs[0] ); } } # for each sub-sequence (if non-canonical characters caused the sequence to be split) foreach $seq ( @seqs ) { $length = length($seq); if ( $length > 0 ) { for $n ( 1 .. $N ) { if ($length >= $n) { # count all oligos of length $n for $x ( 0 .. ( $length - $n ) ) { # NOT "$length - $n + 1"! (7/16/07 --jmb) # substr offsets start at zero (0)! ${ 'nt' . $n }{ substr( $seq, $x, $n ) }++; } } } } } $num_recs_processed++; } close (INFILE); print STDOUT " $num_recs_read records found\n"; if ($num_recs_read == 0) { die "No FASTA records in input file [\"$infile\"]!\nAborting...\n"; } if ( $total_noncanon > 0 ) { $overall_pct_noncanon = $total_noncanon / $total_len * 100; printf STDERR " > Warning: %d non-canonical positions found comprising %.2f%% of bulk sequence data\n\n", $total_noncanon, $overall_pct_noncanon; } print STDOUT "Calculating oligonucleotide frequencies..."; # Section 3 counts the total number of oligonucleotides for each associative array %ntN for $n ( 1 .. $N ) { foreach $mer ( @{ 'NT' . $n } ) { ${ 'total' . $n } += ${ 'nt' . $n }{$mer}; } } # Section 4 calculates the frequency of each oligonucleotide in the associative array for $n ( 1 .. $N ) { $z = $n + 2; # decimal places of precision for this N-mer level if ( ${ 'total' . $n } == 0 ) { foreach $mer ( @{ 'NT' . $n } ) { ${ 'ntf' . $n }{$mer} = sprintf( "%.${z}f", 0 ); } } else { foreach $mer ( @{ 'NT' . $n } ) { ${ 'ntf' . $n }{$mer} = sprintf( "%.${z}f", ${ 'nt' . $n }{$mer} / ${ 'total' . $n } ); } } } print STDOUT " done\n"; print STDOUT "Writing frequencies to \"$outfile\"..."; # Section 5 prints the frequency of each oligonucleotide to the output file for $n ( 1 .. $N ) { @mers = @{ 'NT' . $n }; foreach $mer ( 0 .. $#mers ) { print OUTPUT $mers[$mer], "\t", ${ 'ntf' . $n }{ $mers[$mer] }, "\t", ${ 'nt' . $n }{ $mers[$mer] }, "\n"; } if ( $n == 1 && $total_noncanon > 0 ) { $total_canon = ( $total_len - $total_noncanon ); $overall_pct_canon = $total_canon / $total_len * 100; $overall_pct_noncanon = $total_noncanon / $total_len * 100; #printf OUTPUT "[%s]\t%.3f\t%d\n", $alphabet, $overall_pct_canon / 100, $total_canon; #printf OUTPUT "[^%s]\t%.3f\t%d\n", $alphabet, $overall_pct_noncanon / 100, $total_noncanon; printf OUTPUT "[%s]\t%.3f\t%d\n", "N", $overall_pct_canon / 100, $total_canon; printf OUTPUT "[!%s]\t%.3f\t%d\n", "N", $overall_pct_noncanon / 100, $total_noncanon; } print OUTPUT "\n\n"; } close (OUTPUT); print STDOUT " done\n"; print STDOUT "Writing most- and least-common n-mers table to \"$outfile2\"..."; # Section 6 prints the most and least common oligonucleotides to another output file open( OUTPUT, ">$outfile2" ) || die "Can't open \"$outfile2\": $!\n"; print OUTPUT " Most- and Least-common N-mers\n\n"; print OUTPUT "\tmost\tleast\n"; for $n ( 1 .. $N ) { #%freqs = %{ 'ntf' . $n }; # sort the hash of frequencies by value #@sorted = sort { $freqs{$a} <=> $freqs{$b} } keys %freqs; # sorting takes too much memory for 9-mers and up... just traverse the hash %counts = %{ 'nt' . $n }; $max_count = -1; @max_idxs = (); $min_count = -1; @min_idxs = (); while(($mer, $count) = each(%counts)) { # grab the highest- and lowest-count n-mers if ($count > $max_count || $max_count == -1) { @max_idxs = (); push(@max_idxs, $mer); $max_count = $count; } # track all oligos with max/min count elsif ($count == $max_count) { push(@max_idxs, $mer); } if ($count < $min_count || $min_count == -1) { @min_idxs = (); push(@min_idxs, $mer); $min_count = $count; } elsif ($count == $min_count) { push(@min_idxs, $mer); } } print OUTPUT "$n-mers"; print OUTPUT "\t", $counts{ $max_idxs[0] }; if (scalar(@max_idxs) >= 1) { print OUTPUT "/"; print OUTPUT join(',', @max_idxs); } print OUTPUT "\t", $counts{ $min_idxs[0] }; if (scalar(@min_idxs) >= 1) { print OUTPUT "/"; print OUTPUT join(',', @min_idxs); } print OUTPUT "\n"; } close (OUTPUT); print STDOUT " done\n";