#!/usr/bin/perl -w # Serge Saxonov, Iraj Daizadeh, Alexei Fedorov and Walter Gilbert # Department of Molecular and Cellular Biology # Harvard University # Cambridge, MA 02138 # Reference: Serge Saxonov, Iraj Daizadeh, Alexei Fedorov, and Walter Gilbert # Nucleic Acids Res. 2000 Jan 1;28(1):185-190 #SPL spl notes #SPL spl reduction in Sept 2004 #SPL ver 1.06 12.10.04 #SPL ver 1.07 13.10.04 #SPL ver 1.08 20.10.04 #SPL ver 1.09 21.10.04 #SPL ver 1.10 22.10.04 #SPL ver 1.11 01.11.04 #SPL ver 1.12 08.11.04 #SPL ver 1.13 11.11.04 #SPL ver 1.14 15.11.04 #SPL ver 1.15 21.11.04 #SPL ver 1.16 26.11.04 #SPL ver 1.17 28.11.04 #SPL ver 1.18 02.12.04 #SPL ver 1.19 11.12.04 #SPL ver 1.20 17.12.04 #SPL ver 1.21 23.12.04 #SPL ver 1.22 23.12.04 #SPL ver 1.23 15.01.05 #SPL ver 1.24 15.01.05 #SPL ver 1.25 15.09.05 #SPL ver 1.26 18.09.05 #SPL ver 1.27 29.10.05 #SPL ver 1.28 07.17.05 #SPL ver 1.29 09.11.05 #SPL ver 1.30 09.11.05 #SPL ver 1.31 23.11.05 #SPL ver 1.32 29.12.05 #SPL ver 1.33 02.01.06 #SPL ver 1.34 08.04.07 #SPL ver 1.35 26.07.07 my $txt_file_style=1;# =0 DOS-like style; =1 UNIX-like style #SPL my $eol_str; #SPL my $c_ver='1.35'; if ($txt_file_style==0) {$eol_str="\r\n";} else {$eol_str="\n";} #SPL # global error counters my $tran_err_id_num=0; #SPL number of CDS with in-frame stop codons my $tran_stop_id_num=0; #SPL number of CDS with in-frame stop codons additional my $dir_err_num=0; #invalid mRNA vs CDS direction my $init_codon_err=0; #SPL CDS doesn't starts with ATG my $total_intron_c=0; my $alt_intron_c=0; my $total_intron_in_entr_stop_cod_c=0; my $alt_intron_in_entr_stop_cod_c=0; my $undef_intron_c=0; my $explicit_n_undef_intron_c=0; #a20 my $rna_exons_c=0; my $dna_exons_c=0; my $rna_introns_c=0; my $aux_name='gun_ZZ__'; #SPL aux file to store gunzipped sq file my $UTR_AMBI_flg=0; my $UTR_AMBI_note='/SPL_note="mRNA assignment controversial"'; my $UTR_NF_flg=0; my $UTR_NF_note='/SPLnote="DUMMY mRNA_NF"'; my $translate_err=0; my $translate_flg=0; my $translate_err_stop_cod_c=0; my $expl_UTR_NF_note_c=0; my $expl_UTR_NF_c=0; my $no_protein_id_c=0; my $no_protein_id_mRNA_NF_c=0; my $CDS_incomplete_flg=0; my $CDS_incmpl_c=0; my $mRNA_incomplete_flg=0; my $mRNA_incmpl_c=0; my $last_gene_num=0; my $number_of_gene_with_stop_codons=0; my $all_CDS_with_stop_codons_flg=1; my $some_CDS_with_invalid_codons_flg=0; my $number_of_gene_with_invalid_codons=0; my $c_in_gene_intron_c_max=0; #a20 my $total_intron_num_gene1=0; #a20 my $c_in_gene_exon_c_max=0; #a20 my $total_exon_num_gene1=0; #a20 my $c_in_gene_nonc_intron_c_max=0; #a20 my $total_nonc_intron_num_gene1=0; #a20 total number of non-canonical introns # for AS only form with max number of non-canonical introns counts my $c_in_gene_nonc_atac_intron_c_max=0; #a20 my $total_nonc_atac_intron_num_gene1=0; #a20 my $c_in_gene_eshort_intron_c_max=0; #a21 my $total_eshort_intron_num_gene1=0; #a21 my $c_in_gene_elong_intron_c_max=0; #a21 my $total_elong_intron_num_gene1=0; #a21 my $c_in_gene_undef_intron_c_max=0; #nov7 # .. introns my $total_undef_intron_num_gene1=0; #nov7 my $pseudo_flg=0; ## finalmake.pl ## The heart of the database making process. ($#ARGV != -1) || die "Usage: finalmake.pl prefix\n"; $prefix = $ARGV[0]; open(SEQFILES, "SEQFILES.LIS") || die;#SPL ORI open(SEQFILES, "seqfiles.list") || die; open(IN,"${prefix}.RAC") || die;#SPL ORI open(IN,"${prefix}.RAWchecked") || die; open(DNAS, ">${prefix}.dEID") || die; open(PROTS, ">${prefix}.pEID") || die; open(HEADERS, ">${prefix}.hEID") || die; open(TRACEF, ">>${prefix}.tEID") || die; #SPL trace file already in use open(STATF, ">>${prefix}.sEID") || die; #a20 ## first read in the entire gb raw file while($line = ){ if($line =~ /^\$ID/){ ## first take care of the previous entry if(defined($locus)){ #print $cur_entry,"\n"; #SPL #&press_any_key; #SPL push(@{$all_loci{$locus}}, $cur_entry); # foreach $enay (@{$all_loci{$locus}}) # { print substr($enay,1,320),"\n"; #SPL this is a reference # } # SPL note here is side effect if given $locus is present twice # the data file ??? The sequence of ID is destroyed } ## now start another one $cur_entry = $line; $line = ; ## the LOCUS line $cur_entry .= $line; ($locus) = ($line =~ /^LOCUS\s+(\S+)/); # print "$locus\n";&press_any_key; if(!defined($locus)){ die "Couldn't resolve the LOCUS line in $cur_entry\n"; } } else{ $cur_entry .= $line; ## the code within this next block is responsible for memorizing all ## the references in the cdsjoin lines if($line =~ /^ CDS join|^ CDS complement\(join/){ $cdsline = $line; $parenind = 0; $parenind += ($cdsline =~ tr/\(/\(/); $parenind -= ($cdsline =~ tr/\)/\)/); while($parenind != 0){ $line = ; (defined $line) || die "DIED with parenind = $parenind\n"; $cur_entry .= $line; $parenind += ($line =~ tr/\(/\(/); $parenind -= ($line =~ tr/\)/\)/); $cdsline .= $line; } @codes = ($cdsline =~ /([A-Z01-9\.]*):/g); foreach $code (@codes){ $allcodes{$code} = 1; } } } } #print $cur_entry,"\n"; #SPL # print 'in final', $eol_str;&press_any_key; #SPL #print $locus,"\n"; #SPL push(@{$all_loci{$locus}}, $cur_entry); #print $all_loci{$locus}; #SPL this is a reference #&press_any_key; #SPL ## now get all the references while($infile = ){ $infile =~ s/\s+//g; my $infile_to_push=$infile; my @c_name=split(/\./,$infile); #SPL if (substr($infile,0,1) ne '#') { #SPL if (uc($c_name[-1]) eq 'GZ') { #SPL $aux_name='gun_ZZ__'; #SPL &trace("2 gunzip -c $infile > $aux_name",0); #SPL print ("2 gunzip -c $infile > $aux_name\n"); #SPL system("gunzip -c $infile > $aux_name"); #SPL $infile=$aux_name; #SPL } push(@seqfiles, $infile_to_push); open(SEQ, "$infile") || die; while($line = ){ if($line =~ /^LOCUS/) { $locusline = $line; next; } ($line =~ /^VERSION/) || next; ($code) = ($line =~ /^VERSION\s+(\S+)/); #print $code; #SPL #&press_any_key; #SPL $code || die "$line"; $code1 = $code; ($code2 = $code) =~ s/\.\d+$//; ## we need code2 b/c some cdsjoins neglect to mention the version no #print $code2; #SPL #print %allcodes; #SPL #&press_any_key; #SPL if(!exists($allcodes{$code1}) && !exists($allcodes{$code2})){ next; } while($line = ){ ($line !~ /^LOCUS/) || die "$code:\n$line"; if($line =~ /^ORIGIN/){ last; } } $entry = ""; $line = ; while($line !~ /^\/\//){ $line =~ tr/ABCDEFGHIJKLMNOPQRSTUVWXYZ/ancnnngnnnnnnnnnnnnttnnnnn/; #EMBL-GB # /abcdefghijklmnopqrstuvwxyz/; $line =~ s/[^a-z]//g; $entry .= $line; $line = ; } ($locusline =~ /\smRNA\s|\scDNA\s/) ? ($isCoding = 1) : ($isCoding = 0); if(exists($allcodes{$code1})){ $DNA_access_codes{$code1} = $entry; $cDNA_codes{$code1} = $isCoding; } if(exists($allcodes{$code2})){ $DNA_access_codes{$code2} = $entry; $cDNA_codes{$code2} = $isCoding; } } close SEQ; } #end if not comment line sq file SPL } undef %allcodes; ## scan all the LOCI DNAs and work with all the genes in the locus $c_g=0;#SPL see how many genes foreach $infile (@seqfiles){ @c_name=split(/\./,$infile); #SPL if (uc($c_name[-1]) eq 'GZ') { #SPL $aux_name='gun_ZZ__'; #SPL &trace("3 gunzip -c $infile > $aux_name",0); #SPL print ("3 gunzip -c $infile > $aux_name\n"); #SPL system("gunzip -c $infile > $aux_name"); #SPL $infile=$aux_name; #SPL } open(SEQ, $infile) || die; while($line = ){ ($line =~ /^LOCUS/) || next; ($locus) = ($line =~ /^LOCUS\s+(\S+)/); $locus || die "$line"; unless(exists $all_loci{$locus}){ next; } while($line = ){ ($line !~ /^LOCUS/) || die "$locus:\n$line"; if($line =~ /^ORIGIN/){ last; } } $entry = ""; $line = ; while($line !~ /^\/\//){ (defined($line)) || die "$infile\n$locus\n$entry\n"; $line =~ tr/ABCDEFGHIJKLMNOPQRSTUVWXYZ/ancnnngnnnnnnnnnnnnttnnnnn/; #EMBL-GB # /abcdefghijklmnopqrstuvwxyz/; $line =~ s/[^a-z]//g; $entry .= $line; $line = ; } $DNA_access_codes{"current"} = $entry; foreach $gene (@{$all_loci{$locus}}){ $c_g++; #print "$c_g:"; #SPL &do_locus($gene); } } close SEQ; } &handle_last_gene_info; #a20 &trace("number of entries with invalid codons=$tran_err_id_num",0); #SPL &trace("number of entries with stop codons=$tran_stop_id_num",0); #SPL &trace("number of entries with invalid mRNA vs CDS directions=$dir_err_num",0); &trace("number of CDS starting with not ATG=$init_codon_err",0);#SPL &trace(" total number of introns=$total_intron_c",0);#SPL &trace(" number of introns in entries with stop codon=$total_intron_in_entr_stop_cod_c",0);#SPL &trace("number of non-canonical introns=$alt_intron_c",0);#SPL &trace("number of non-canonical introns in entries with stop codons=$alt_intron_in_entr_stop_cod_c",0);#SPL &trace("number of undefined intron ends=$undef_intron_c",0);#SPL &trace("including undefined 'n' intron ends=$explicit_n_undef_intron_c",0);#a20 &trace("Number of genes with undefined introns=$total_undef_intron_num_gene1",0); #nov7 &trace("number of RNA exons =$rna_exons_c",0);#SPL &trace("number of RNA introns=$rna_introns_c",0);#SPL &trace("number of CDS exons =$dna_exons_c",0);#SPL # no further &trace("old version translation errors=$translate_err",0);#SPL # no further &trace("tran err in stop_codon counter=$translate_err_stop_cod_c",0);#SPL &trace("explicit UTR_NF_note counter=$expl_UTR_NF_note_c",0); &trace("explicit UTR_NF counter=$expl_UTR_NF_c",0);#SPL &trace("no_protein_id_number=$no_protein_id_c",0); &trace("no_protein_id_number in mRNA_NF case=$no_protein_id_mRNA_NF_c",0); &trace("number of incomplete CDS= $CDS_incmpl_c",0); &trace("number of incomplete mRNA=$mRNA_incmpl_c",0); &trace("version $c_ver",0); close TRACEF; #SPL &trace_stat("",0); #a20 &trace_stat("B.Problematic genes:",0); #a20 &trace_stat("Number of genes with stop codons inside CDS (for genes",0); #SPL &trace_stat(" with alternative splicing only the case when all",0); #SPL &trace_stat(" isoforms have stop codons inside CDS counts)............ $number_of_gene_with_stop_codons",0); #SPL &trace_stat("Number of CDS starting not from ATG codon..................... $init_codon_err",0); #a20 &trace_stat("Number of genes with invalid/unidentified codons.............. $number_of_gene_with_invalid_codons",0); #SPL &trace_stat("",0); #a20 &trace_stat("C. Exons and introns:",0); #a20 &trace_stat("Total number of introns (for genes with AS only one isoform",0); #a20 &trace_stat(" with max number of introns counts)...................... $total_intron_num_gene1",0); #a20 &trace_stat("Total number of exons (for genes with AS only one isoform",0); #a20 &trace_stat(" with max number of exons counts)........................ $total_exon_num_gene1",0); #a20 &trace_stat("Number of non-canonical introns (non TG..AG; for genes",0); #a20 &trace_stat(" with AS only one isoform with max number of ",0); #a20 &trace_stat(" non-canonical introns counts)........................... $total_nonc_intron_num_gene1",0); #a20 &trace_stat("Number of (AT..AC) introns (for genes with AS only one",0); #a20 &trace_stat(" isoform with max number of AT..AC introns counts)....... $total_nonc_atac_intron_num_gene1",0); #a20 &trace_stat("Number of extra-short introns (<30 bp; for genes with AS only",0); #a21 &trace_stat(" one isoform with maximal number of extra-short",0); #a21 &trace_stat(" introns counts)......................................... $total_eshort_intron_num_gene1",0); #a21 &trace_stat("Number of extra-long introns (>100,000 bp; for genes",0); #a21 &trace_stat(" with AS only one isoform with maximal number of",0); #a21 &trace_stat(" extra-long introns counts).............................. $total_elong_intron_num_gene1",0); #a21 &trace_stat("Number of introns with unidentified ends...................... $undef_intron_c",0); #a21 close STATF; #a20 exit; #SPL sub do_locus{ my $gene_rec_end="_GENE_REC_END_\n"; my $mRNA_rec_end="_mRNA_REC_END_\n"; my $aux_str=''; my $cur_entry = $_[0]; ($id) = ($cur_entry =~ /^\$ID\s+(\d+\w*)/); #new numbering system due to new spec #SPL ORI ($id) = ($cur_entry =~ /^\$ID\s+(\d+)/); ($c_gene_num) = ($cur_entry =~ /^\$ID\s+(\d+)/); #a20 # print substr($cur_entry,0,320); &press_any_key; if (($c_gene_num != $last_gene_num) && ($last_gene_num!=0)){ # push all about last gene read in if ($all_CDS_with_stop_codons_flg==1) { $number_of_gene_with_stop_codons++; } # end if all CDS with stop codons if ($some_CDS_with_invalid_codons_flg==1) { $number_of_gene_with_invalid_codons++; } # end if some CDS with invalid codon $total_intron_num_gene1=$total_intron_num_gene1+$c_in_gene_intron_c_max; #a20 $total_exon_num_gene1=$total_exon_num_gene1+$c_in_gene_exon_c_max; #a20 $total_nonc_intron_num_gene1=$total_nonc_intron_num_gene1+$c_in_gene_nonc_intron_c_max; #a20 $total_nonc_atac_intron_num_gene1=$total_nonc_atac_intron_num_gene1+$c_in_gene_nonc_atac_intron_c_max; #a20 $total_eshort_intron_num_gene1+=$c_in_gene_eshort_intron_c_max; #a21 $total_undef_intron_num_gene1+=$c_in_gene_undef_intron_c_max; #nov7 $total_elong_intron_num_gene1+=$c_in_gene_elong_intron_c_max; #a21 } #end if new gene #a20 # memorize last gene num if ($c_gene_num != $last_gene_num) {$last_gene_num=$c_gene_num; $all_CDS_with_stop_codons_flg=1; $some_CDS_with_invalid_codons_flg=0; $c_in_gene_intron_c_max=0; $c_in_gene_exon_c_max=0; $c_in_gene_nonc_intron_c_max=0; #a20 $c_in_gene_nonc_atac_intron_c_max=0; #a20 $c_in_gene_eshort_intron_c_max=0; #a21 $c_in_gene_undef_intron_c_max=0; #nov7 $c_in_gene_elong_intron_c_max=0; #a21 } $cur_entry =~ /\n_HEADER_END_\n/; $header = $` . $&; $cds_info = $'; # print substr($cds_info,0,1600); &press_any_key; $aux_str=$cds_info; #SPL $aux_str=~/$gene_rec_end/; #SPL $gene_info=$`; #SPL # print $gene_info; &press_any_key; $aux_str=$'; #SPL $aux_str=~/$mRNA_rec_end/; #SPL $mRNA_info=$`; #SPL # print $mRNA_info;&press_any_key; $cds_info = $'; #SPL # print $cds_info;&press_any_key; if($mRNA_info =~ /\s{21}$UTR_AMBI_note/) # it should not be used caret {$UTR_AMBI_flg=1; # print "UTR_AMBI found\n"; } else {$UTR_AMBI_flg=0} if($mRNA_info =~ /\s{21}$UTR_NF_note/) # it should not be used caret {$UTR_NF_flg=1;$expl_UTR_NF_note_c++;} else {$UTR_NF_flg=0} $header =~ s/^.*\n//; ($locusline,$defline) = ($header =~ /^(.*\n)(.*)\n/); ($locusline =~ /\smRNA\s|\scDNA\s/) ? ($isCoding = 1) : ($isCoding = 0); $cDNA_codes{"current"} = $isCoding; $defline = substr($defline, 11); $defline=&chop_if_chr($defline,"\r"); #SPL chop 0D in DOS-like input file &get_gene_mRNA_cds; #SPL ORI &get_cds; &trace("$c_g ${id} $defline $gene_name",0); print $defline,' ',$gene_name, "\n"; &make_cds_entry_GB149; #SPL ORI &make_cds_entry; } sub make_cds_entry{ $CDS_left_wing=999999999;#5000; #SPL $CDS_right_wing=999999999;#5000; #SPL my $mRNA_start=0; #SPL my $mRNa_end=0; #SPL my $cds_start=0; #SPL my $cds_end=0; #SPL my $cds_shift=0; #SPL my $mRNA_tail=0; #SPL my $cds_err=0; my $mRNA_err=0; my $mRNA_dir=0; my $cds_dir=0; my $aux; my $aux_skip_flg=0; my $last_rexon_len; my $left_intron_protrud=0; my $right_intron_protrud=0; $pseudo_flg=0; ($mRNA_start,$mRNA_end,$mRNA_dir,$mRNA_err)=&get_feature_start_end($mRNAline,2); #SPL if ($mRNA_err==1) {&trace("mRNA field format error",4)}; ($cds_start,$cds_end,$cds_dir,$cds_err)=&get_feature_start_end($cdsline,2); #SPL if ($cds_err==1) {&trace("CDS field format error",4)}; #SPL that is now working with pseudo left and right if complement if ($mRNA_dir!=$cds_dir) {&trace("mRNA direction doesn't match that of CDS",4); $dir_err_num++; return 1} $cds_shift=$cds_start-$mRNA_start; # print "cds_start=$cds_start mRNA_start=$mRNA_start cds_sh=$cds_shift\n"; # &press_any_key; if ($mRNA_dir==1) { $aux=$CDS_left_wing;$CDS_left_wing=$CDS_right_wing; $CDS_right_wing=$aux}; if ($cds_shift>$CDS_left_wing) {$cds_shift=$CDS_left_wing} #SPL $mRNA_tail=$mRNA_end-$cds_end; #SPL if ($mRNA_tail>$CDS_right_wing) {$mRNA_tail=$CDS_right_wing} #SPL my $c_mRNA_start=$mRNA_start; my @raw_exons = &exons_from_cds($mRNAline); ## essentially splits the line along commas my ($rawexon, $rna_exon); my (@rna_exons, @rexon_lens, @rna_introns, $splice_site, $splice_info, @splice_sites); my($i); # print "passed 1\n"; unless (($rna_exon, $rexon_len) = &get_exonDNA($raw_exons[0])){ return; } # print "passed 2\n"; push(@rna_exons, $rna_exon); push(@rexon_lens, $rexon_len); for($i = 1; $i <= $#raw_exons;$i++){ (($rna_exon, $rexon_len) = &get_exonDNA($raw_exons[$i])) || return; # print "passed 3 raw_exons=$#raw_exons"; push(@rna_exons, $rna_exon); push(@rexon_lens, $rexon_len); (($rna_intron, $splice_site) = &get_intronDNA($raw_exons[$i-1],$raw_exons[$i])) || return; # print "passed 4"; push(@splice_sites, $splice_site); push(@rna_introns, $rna_intron); # ??? } # print "@rexon_lens\n"; # trunc left if necessery $i=0; # print "$cds_start $cds_shift $c_mRNA_start $rexon_lens[$i]\n"; # print "passed 5\n"; # &press_any_key; $last_rexon_len=$rexon_lens[$i]; if (($cds_start-$cds_shift)<=($c_mRNA_start+$rexon_lens[$i]-1)) { $aux=$cds_start-$cds_shift-$c_mRNA_start; # print "aux=$aux\n";&press_any_key; $rna_exon=substr($rna_exons[$i],$aux); $rna_exons[$i]=$rna_exon; # print "old $rexon_lens[0]=$rexon_lens[$i]\n"; $rexon_lens[$i]= $rexon_lens[$i]-$aux; $aux_skip_flg=1; } else {shift @rna_exons; shift @rexon_lens;shift @raw_exons} # print "new rexon_lens[$i]=$rexon_lens[$i]"; # print "c_mRNA_start=$c_mRNA_start\n"; $c_mRNA_start+=$last_rexon_len; # print "c_mRNA_start=$c_mRNA_start\n"; if ($aux_skip_flg==0) { # for($i = 1; $i <= $#raw_exons;$i++){ while ($i == 0) { #exit due to shift arrays $last_rna_intron_len=length($rna_introns[$i]); if (($cds_start-$cds_shift)<=($c_mRNA_start+$last_rna_intron_len-1)) { $aux=$cds_start-$cds_shift-$c_mRNA_start; # print "$cds_start $cds_shift $c_mRNA_start $rexon_lens[$i]\n"; # print "aux=$aux\n";&press_any_key; $rna_intron=substr($rna_introns[$i],$aux); $rna_introns[$i]=$rna_intron; $left_intron_protrud=1; last; } else {shift @splice_sites;shift @rna_introns} $c_mRNA_start+=$last_rna_intron_len; $last_rexon_len=$rexon_lens[$i]; if (($cds_start-$cds_shift)<=($c_mRNA_start+$rexon_lens[$i]-1)) { $aux=$cds_start-$cds_shift-$c_mRNA_start; # print "i=$i aux=$aux\n";&press_any_key; $rna_exon=substr($rna_exons[$i],$aux); $rna_exons[$i]=$rna_exon; $rexon_lens[$i]= $rexon_lens[$i]-$aux; # print $rexon_lens[$i],"\n";&press_any_key; last; } else {shift @rna_exons;shift @rexon_lens;shift @raw_exons} $c_mRNA_start+=$last_rexon_len; } #end for $i } #end if not skip # print ">>@rexon_lens<<\n"; # trunc right if necessery # if ($mRNA_tail>$CDS_right_wing) {$mRNA_tail=$CDS_right_wing} #SPL my $c_mRNA_end=$mRNA_end; $aux_skip_flg=0; $i=$#raw_exons; # print "$cds_end $mRNA_tail $c_mRNA_end $rexon_lens[$i] i=$i\n"; # print "passed 6 starting right wing\n"; # &press_any_key; $last_rexon_len=$rexon_lens[$i]; if (($cds_end+$mRNA_tail)>=($c_mRNA_end-$rexon_lens[$i]+1)) { $aux=-$cds_end-$mRNA_tail+$c_mRNA_end; # print "aux=$aux\n";&press_any_key; $rna_exon=substr($rna_exons[$i],0,$rexon_lens[$i]-$aux); $rna_exons[$i]=$rna_exon; $rexon_lens[$i]= $rexon_lens[$i]-$aux; $aux_skip_flg=1; } else {pop @rna_exons; pop @rexon_lens; pop @raw_exons} # print "c_mRNA_end=$c_mRNA_end\n"; $c_mRNA_end-=$last_rexon_len; # print "2c_mRNA_end=$c_mRNA_end\n"; $j=$#rna_introns; # we dont know how many elements in # @rna_introns at this point # print "3c_mRNA_end=$c_mRNA_end\n"; if ($aux_skip_flg==0) { for($i = $#raw_exons;$i>0;$i--){ print "4c_mRNA_end=$c_mRNA_end i=$i\n"; $last_rna_intron_len=length($rna_introns[$j]); if (($cds_end+$mRNA_tail)>=($c_mRNA_end-$last_rna_intron_len+1)) { $aux=-$cds_end-$mRNA_tail+$c_mRNA_end; # print "$cds_end $mRNA_tail $c_mRNA_end $rexon_lens[$i] i=$i\n"; # print "aux=$aux\n";&press_any_key; $rna_intron=substr($rna_introns[$j],0,length($rna_introns[$j])-$aux); $rna_introns[$j]=$rna_intron; $right_intron_protrud=1; last; } else {pop @splice_sites;pop @rna_introns; } $c_mRNA_end-=$last_rna_intron_len; # print "5c_mRNA_end=$c_mRNA_end last_rna_intron_len=$last_rna_intron_len\n"; $last_rexon_len=$rexon_lens[$i]; if (($cds_end-$mRNA_tail)>=($c_mRNA_end-$rexon_lens[$i]+1)) { $aux=-$cds_end-$mRNA_tail+$c_mRNA_end; # print "aux=$aux\n";&press_any_key; $rna_exon=substr($rna_exons[$i],0,$rexon_lens[$i]-$aux); $rna_exons[$i]=$rna_exon; $rexon_lens[$i]= $rexon_lens[$i]-$aux; last; } else {pop @rna_exons;pop @rexon_lens} # we dont need pop @raw_exons # since in for statement $#raw_exons $c_mRNA_end-=$last_rexon_len; $j--; } #end for $i } # get left description of pre-CDS intron phases if appropriate # and get lright description of post-CDS intron phases if appropriate my $dna_rec_start=$cds_start-$cds_shift; my $dna_rec_end=$cds_end+$mRNA_tail; my $c_pos=$dna_rec_start; my $left_wing_mRNA_phase_rec=''; $j=0; $i=0; if ($left_intron_protrud==1) {$c_pos+=length($rna_introns[$j++]); $left_wing_mRNA_phase_rec='u';} $c_pos+=$rexon_lens[$i++]; while ($c_pos<=$cds_start) { $left_wing_mRNA_phase_rec.='u'; $c_pos+=$rexon_lens[$i++]+length($rna_introns[$j++]); } $c_pos=$dna_rec_end; my $right_wing_mRNA_phase_rec=''; $j=$#rna_introns; $i=$#rexon_lens; if ($right_intron_protrud==1) {$c_pos-=length($rna_introns[$j]); $j--; $right_wing_mRNA_phase_rec='u'; } $c_pos-=$rexon_lens[$i--]; while ($c_pos>=$cds_end) { $right_wing_mRNA_phase_rec.='u'; # print "rexon_lens[$i]=$rexon_lens[$i]\n"; $c_pos-=$rexon_lens[$i--]+length($rna_introns[$j--]); } if ($mRNA_dir==0) {$out_CDS_start=$cds_shift+1} else {$out_CDS_start=$mRNA_tail+1}; # print "cds_shift=$cds_shift mRNA_tail=$mRNA_tail\n" ; if ($mRNA_dir==0) {$aux=$mRNA_tail} else {$aux=$cds_shift}; $out_CDS_end=$dna_rec_end-($dna_rec_start-1)-$aux; #print "left_ph=$left_wing_mRNA_phase_rec right_ph=$right_wing_mRNA_phase_rec\n"; #&press_any_key; #aux printing intermediate structure for debug # print "left=$left_intron_protrud right=$right_intron_protrud\n"; # $j=0; # if ($left_intron_protrud==1) {print length($rna_introns[$j]),' ' ;$j=1} # for ($i=0;$i<=$#rna_exons-1; $i++) # {print length($rna_exons[$i]),' ',length($rna_introns[$j++]),' ';} # print length($rna_exons[$#rna_exons]),' '; # if ($right_intron_protrud==1) {print length($rna_introns[$j]),' '}; # &press_any_key; if ($mRNA_dir==1) { #SPL ORI if($cdsline =~ /complement\(join/){ # ) @rexon_lens = reverse @rexon_lens; @splice_sites = reverse @splice_sites; foreach (@splice_sites){ $_ = reverse $_;$_ =~ tr/actg[a-z]/tgacn/; } @rna_exons = reverse(@rna_exons); foreach (@rna_exons){$_ = reverse $_;$_ =~ tr/ACTG[A-Z]/TGACN/;} @rna_introns = reverse(@rna_introns); foreach (@rna_introns){$_ = reverse $_;$_ =~ tr/actg[a-z]/tgacn/;} my $aux_s=$right_wing_mRNA_phase_rec; $right_wing_mRNA_phase_rec=$left_wing_mRNA_phase_rec; $left_wing_mRNA_phase_rec=$aux_s; #print "left_ph=$left_wing_mRNA_phase_rec right_ph=$right_wing_mRNA_phase_rec\n"; $aux=$right_intron_protrud;$right_intron_protrud=$left_intron_protrud; $left_intron_protrud=$aux; #&press_any_key; } #SPL counting intron number my $in_entry_intron_c=0; #a20 my $in_entry_exon_c=1; #a20 my $in_entry_nonc_intron_c=0; #a20 my $c_splice_site=''; #a20 my $in_entry_nonc_atac_intron_c=0; #a20 foreach $c_splice_site (@splice_sites){ $total_intron_c++; $in_entry_intron_c++; #a20 $in_entry_exon_c++; #a20 if (($c_splice_site eq 'EEEE') or ($c_splice_site=~/n|N/)) {$undef_intron_c++; #a21 &trace("invalid_splice_site=$c_splice_site",4); } elsif ($c_splice_site ne 'gtag') {$alt_intron_c++; $in_entry_nonc_intron_c++; #a20 #a21 &trace("splice_site=$c_splice_site",4); } if ($c_splice_site=~/n/) {$explicit_n_undef_intron_c++} #a20 if ($c_splice_site eq 'atac') {$in_entry_nonc_atac_intron_c++ } #a20 } if ($in_entry_intron_c >=$c_in_gene_intron_c_max) {$c_in_gene_intron_c_max=$in_entry_intron_c} #a20 if ($in_entry_exon_c >=$c_in_gene_exon_c_max) {$c_in_gene_exon_c_max=$in_entry_exon_c} #a20 if ($in_entry_nonc_intron_c >=$c_in_gene_nonc_intron_c_max) {$c_in_gene_nonc_intron_c_max=$in_entry_nonc_intron_c} #a20 if ($in_entry_nonc_atac_intron_c >=$c_in_gene_nonc_atac_intron_c_max) {$c_in_gene_nonc_atac_intron_c_max=$in_entry_nonc_atac_intron_c} #a20 # handle extra-short and extra-long introns #a21 my $in_entry_eshort_intron_c=0; #a21 my $in_entry_elong_intron_c=0; #a21 my $c_i_len=0; foreach (@rna_introns){ $c_i_len=length($_); if ($c_i_len<30) {$in_entry_eshort_intron_c++} # if ($c_i_len<30) {&trace ("c_i_len=$c_i_len",0)} if ($c_i_len>100000) {$in_entry_elong_intron_c++} } #end for each intron a21 if ($in_entry_eshort_intron_c>$c_in_gene_eshort_intron_c_max) {$c_in_gene_eshort_intron_c_max=$in_entry_eshort_intron_c} if ($in_entry_elong_intron_c>$c_in_gene_elong_intron_c_max) {$c_in_gene_elong_intron_c_max=$in_entry_elong_intron_c} # end handle extra-short and extra-long introns #a21 foreach (@rna_exons){$rna_exons_c++} foreach (@rna_introns){$rna_introns_c++} @raw_exons = &exons_from_cds($cdsline); ## essentially splits the line along commas my (@dna_exons, @exon_lens, @dna_introns); # print ">>>@raw_exons<<< $#raw_exons\n"; unless (($dna_exon, $exon_len) = &get_exonDNA($raw_exons[0])){ return; } push(@dna_exons, $dna_exon); push(@exon_lens, $exon_len); for($i = 1; $i <= $#raw_exons;$i++){ (($dna_exon, $exon_len) = &get_exonDNA($raw_exons[$i])) || return; push(@dna_exons, $dna_exon); push(@exon_lens, $exon_len); # (($dna_intron, $splice_site) = &get_intronDNA($raw_exons[$i-1],$raw_exons[$i])) || # return; # push(@splice_sites, $splice_site); # push(@dna_introns, $dna_intron); } if($cdsline =~ /complement\(join/){ # ) @exon_lens = reverse @exon_lens; # @splice_sites = reverse @splice_sites; # foreach (@splice_sites){$_ = reverse $_;$_ =~ tr/actg[a-z]/tgacn/;} @dna_exons = reverse(@dna_exons); foreach (@dna_exons){$_ = reverse $_;$_ =~ tr/ACTG[A-Z]/TGACN/;} # @dna_introns = reverse(@dna_introns); # foreach (@dna_introns){$_ = reverse $_;$_ =~ tr/actg[a-z]/tgacn/;} } # print "!!!@exon_lens!!!\n"; $out_CDS_len=0;foreach (@exon_lens){$out_CDS_len+=$_}; #SPL foreach (@dna_exons){$dna_exons_c++} $splice_info = join("\,",@splice_sites); #print $splice_info,"\n"; #SPL my $aux_sum=0; for (my $k=0;$k<=$#dna_exons;$k++) { $aux_sum+=length($dna_exons[$k]); # print length($dna_exons[$k]),' ',$aux_sum,' ',$aux_sum/3, "\n"; } #&press_any_key; #SPL $CDS_sq=&get_CDS_sq(\@dna_exons); #SPL &init_gene_code; #SPL $translation=''; my $extra_cod=(length($CDS_sq)-$codon_start)%3; # if ($gene_name eq 'PIM2') { # print "$CDS_sq\n"; # print length($CDS_sq),"\n"; # print "codon_start=$codon_start\n"; # print "extra_cod=$extra_cod\n"; # &press_any_key; # } $translate_err_flg=0; if ($extra_cod!=0) { my $aux_e_cod=substr($CDS_sq,length($CDS_sq)-$extra_cod,$extra_cod); # if ($gene_name eq 'PIM2') {print "$aux_e_cod\n";&press_any_key;} if (($aux_e_cod ne 'AA') && ($aux_e_cod ne 'AG') && ($aux_e_cod ne 'GA')) { $translate_err++; $translate_err_flg=1; #no further &trace("old vers tran err in ".$gene_name,4) } } while ($extra_cod>0) {chop($CDS_sq);$extra_cod--} my $aux_code=&translate1(substr($CDS_sq,$codon_start)); #SPL # if ($gene_name eq 'PIM2') {print "$translation\n";&press_any_key;} # if ($aux_code!=0) {$err_code=11} #err code #SPL if ($aux_code!=0) {&trace("in-frame invalid codon",4); $tran_err_id_num++; $some_CDS_with_invalid_codons_flg=1; } #SPL $aux_code=&init_codon(substr($CDS_sq,$codon_start,3)); #SPL if ($aux_code!=0) {&trace("not starting with init codon",4); $init_codon_err++} #SPL my $stop_code=&check_trunc_translate; #SPL if ($stop_code!=0) {&trace("in-frame stop codon",4); $tran_stop_id_num++;} #SPL if ($stop_code ==0) {$all_CDS_with_stop_codons_flg=0} #a20 if (($stop_code!=0) && ($translate_err_flg ==1)) {$translate_err_stop_cod_c++} #SPL counting intron number in bad CDS if ($stop_code!=0) { $c_splice_site=''; foreach $c_splice_site (@splice_sites){ $total_intron_in_entr_stop_cod_c++; if (($c_splice_site eq 'EEEE') or ($c_splice_site =~/N|n/)) {} #a21 correction elsif ($c_splice_site ne 'gtag') {$alt_intron_in_entr_stop_cod_c++} }} # print $translation,"\n"; # print "CDS_len=",length($CDS_sq)," tran_len=",length($translation),"\n"; # &press_any_key; substr($dna_exons[0], 0, $codon_start) = ""; $exon_lens[0] -= $codon_start; ($phases, $p_exonlens, $p_intr_pos) = &get_prot_info_via_tran(\@exon_lens); #SPL ORI ($phases, $p_exonlens, $p_intr_pos) = &get_prot_info(\@exon_lens); ## this also changes $translation my $dphases=$left_wing_mRNA_phase_rec.$phases.$right_wing_mRNA_phase_rec; #SPL correct splice_info due to u symbols in phases. $i=0; my @psplice_sites=(); # print $#splice_sites+1,' ', length($dphases),"\n";&press_any_key; foreach( @splice_sites) {if (substr($dphases,$i++,1) ne 'u'){ push @psplice_sites,$_; } } $psplice_info = join("\,",@psplice_sites); # print "phases=$phases, p_exonlens=$p_exonlens, p_intr_pos=$p_intr_pos\n"; # &press_any_key; ($d_intron_sizes, $d_exon_sizes, $d_i_sum, $d_e_sum, $DNA) = &make_DNA_via_rna(\@rna_exons, \@rna_introns,$left_intron_protrud,$right_intron_protrud); #SPL ORI ($d_intron_sizes, $d_exon_sizes, $d_i_sum, $d_e_sum, $DNA) = &make_DNA(\@dna_exons, \@dna_introns); # print $d_exon_sizes,"\n"; # &press_any_key; if ($cds_info =~ /\n \/pseudo/) {$pseudo_flg=1} #s15 ($prot_id) = ($cds_info =~ /\n \/protein_id=\"(.*)\"/); if (defined($prot_id)) {} else {&trace("no protein_id", 4); $no_protein_id_c++; if ($UTR_NF_flg!=0) {$no_protein_id_mRNA_NF_c++} #return } #SPL ORI defined($prot_id) || return; # defined($prot_id) || die "No protein id: $locus, $cds_info\n"; $defline.=" /gene=\"$gene_name\""; #SPL adding gene name to header ## now we can print #temp see all: #SPL if ($stop_code==0) { #SPL &print_header($phases, $p_exonlens, $p_intr_pos, $psplice_info); #SPL ORI &print_header($phases, $p_exonlens, $p_intr_pos, $splice_info); &print_dna($dphases, $d_intron_sizes, $d_exon_sizes, $d_i_sum, $d_e_sum, $DNA, $splice_info,$stop_code); #SPL ORI &print_dna($phases, $d_intron_sizes, $d_exon_sizes, $d_i_sum, $d_e_sum, $DNA, $splice_info); &print_prots($phases, $p_exonlens, $p_intr_pos, $psplice_info); #SPL ORI &print_prots($phases, $p_exonlens, $p_intr_pos, $splice_info); } #end if ($stop_code==0) } sub make_cds_entry_GB149{ #special variant to debug gb149 cross-references $CDS_left_wing=999999999;#5000; #SPL $CDS_right_wing=999999999;#5000; #SPL my $mRNA_start=0; #SPL my $mRNa_end=0; #SPL my $cds_start=0; #SPL my $cds_end=0; #SPL my $cds_shift=0; #SPL my $mRNA_tail=0; #SPL my $cds_err=0; my $mRNA_err=0; my $mRNA_dir=0; my $cds_dir=0; my $aux; my $aux_skip_flg=0; my $last_rexon_len; my $left_intron_protrud=0; my $right_intron_protrud=0; my $mRNA_line_sh=''; my $mRMA_line_ve=''; my $cds_line_sh=''; my $cds_line_ve=''; my @cds_raw_exons; $pseudo_flg=0; ($mRNA_start,$mRNA_end,$mRNA_dir,$mRNA_err)=&get_feature_start_end($mRNAline,2); #SPL if ($mRNA_err==1) {&trace("mRNA field format error",4)}; ($cds_start,$cds_end,$cds_dir,$cds_err)=&get_feature_start_end($cdsline,2); #SPL if ($cds_err==1) {&trace("CDS field format error",4)}; # here only dir are used further # print "$mRNAline\n";&press_any_key; #SPL that is now working with pseudo left and right if complement if ($mRNA_dir!=$cds_dir) {&trace("mRNA direction doesn't match that of CDS",4); $dir_err_num++; return 1} # print "mRNA_dir=$mRNA_dir cds_dir=$cds_dir\n"; # &press_any_key; if ($mRNA_dir==1) { $aux=$CDS_left_wing;$CDS_left_wing=$CDS_right_wing; $CDS_right_wing=$aux}; my @raw_exons = &exons_from_cds_GB149($mRNAline); ## essentially splits the line along commas my ($rawexon, $rna_exon); my (@rna_exons, @rexon_lens, @rna_introns, $splice_site, $splice_info, @splice_sites); my($i); # if ($gene_name eq 'SET') {print "passed 1\n"; # &press_any_key;} unless (($rna_exon, $rexon_len) = &get_exonDNA($raw_exons[0])){ return; } # print "passed 2\n"; push(@rna_exons, $rna_exon); push(@rexon_lens, $rexon_len); for($i = 1; $i <= $#raw_exons;$i++){ (($rna_exon, $rexon_len) = &get_exonDNA($raw_exons[$i])) || return; # print length($rna_exon),"\n"; # print "passed 3 raw_exons=$#raw_exons"; push(@rna_exons, $rna_exon); push(@rexon_lens, $rexon_len); (($rna_intron, $splice_site) = &get_intronDNA_GB149($raw_exons[$i-1],$raw_exons[$i])) || return; push(@splice_sites, $splice_site); push(@rna_introns, $rna_intron); # ??? } # print "@rexon_lens\n"; # print "passed 4"; &press_any_key; #nov4 calc left-UTR and right-UTR @cds_raw_exons = &exons_from_cds_GB149($cdsline); ## essentially splits the line along commas my $c_left_wing=0; my $c_right_wing=0; my $in_CDS_flg=0; my $c_e=0; my $c_cds_e=0; my $cds_starting=0; my $cds_ending=0; my $cds_start_exon_ind=0; # null-based, numbering order as in FT my $cds_end_exon_ind=$#raw_exons; # null_based, numbering order as in FT $cds_end_exon_ind=$#cds_raw_exons; # dec2912 my ($rna_exon_bi_ref,$rna_exon_len,$rna_exon_complement); my ($cds_exon_bi_ref,$cds_exon_len,$cds_exon_complement); while (($c_e<=$#raw_exons)&&($in_CDS_flg==0)) { ($rna_exon_bi_ref,$rna_exon_len,$rna_exon_complement)= &get_exon_bi_ref($raw_exons[$c_e]); ($cds_exon_bi_ref,$cds_exon_len,$cds_exon_complement)= &get_exon_bi_ref($cds_raw_exons[$c_cds_e]); $cds_starting=&get_cds_starting($rna_exon_bi_ref,$rna_exon_complement, $cds_exon_bi_ref,$cds_exon_complement); # print "left_w: $rna_exon_bi_ref --- $cds_exon_bi_ref $cds_starting\n"; if ($cds_starting) {$c_left_wing+=($rna_exon_len-$cds_exon_len); $in_CDS_flg=1;$cds_start_exon_ind=$c_e;$c_cds_e++} else {$c_left_wing+=$rna_exon_len} $c_e++; } #end while raw_exons left wing while (($c_e<=$#raw_exons)&&($in_CDS_flg==1)) { # print "c_e=$c_e c_cds_e=$c_cds_e total_cds_exons=$#cds_raw_exons\n"; ($rna_exon_bi_ref,$rna_exon_len,$rna_exon_complement)= &get_exon_bi_ref($raw_exons[$c_e]); ($cds_exon_bi_ref,$cds_exon_len,$cds_exon_complement)= &get_exon_bi_ref($cds_raw_exons[$c_cds_e]); $cds_ending=&get_cds_ending($rna_exon_bi_ref,$rna_exon_complement, $cds_exon_bi_ref,$cds_exon_complement, $c_cds_e,$#cds_raw_exons); if ($cds_ending) {$c_right_wing+=($rna_exon_len-$cds_exon_len); $in_CDS_flg=0;$cds_end_exon_ind=$c_e} else {$c_cds_e++} $c_e++; } #end while raw_exons CDS overlap while ($c_e<=$#raw_exons) { ($rna_exon_bi_ref,$rna_exon_len,$rna_exon_complement)= &get_exon_bi_ref($raw_exons[$c_e]); $c_right_wing+=$rna_exon_len; $c_e++; } #end while raw_exons right wing # print "l_w=$c_left_wing r_w=$c_right_wing e_st=$cds_start_exon_ind e_end=$cds_end_exon_ind \n"; # &press_any_key; #nov4 incorporation of introns if ($cds_start_exon_ind>0) { for($i = 1; $i <= $cds_start_exon_ind;$i++) {if ($rna_introns[$i-1] ne '..') {$c_left_wing+=length($rna_introns[$i-1])}} } #end left wing if ($cds_end_exon_ind<$#raw_exons) { for($i = $cds_end_exon_ind+1; $i <=$#raw_exons ;$i++) {if ($rna_introns[$i-1] ne '..') {$c_right_wing+=length($rna_introns[$i-1])}} } #end right wing # print "l_w=$c_left_wing r_w=$c_right_wing + introns \n"; # &press_any_key; # nov 5 trunc left if necessery my $extra=0; if ($c_left_wing>$CDS_left_wing) {$extra=$c_left_wing-$CDS_left_wing} my $extra_left=$c_left_wing; if ($c_left_wing>$CDS_left_wing) {$extra_left=$CDS_left_wing} if ($extra>0) { $i=0; $last_rexon_len=$rexon_lens[$i]; if (($extra)<($rexon_lens[$i])) { $rna_exon=substr($rna_exons[$i],$extra); $rna_exons[$i]=$rna_exon; # print "old $rexon_lens[0]=$rexon_lens[$i]\n"; $rexon_lens[$i]= $rexon_lens[$i]-$extra; $aux_skip_flg=1; } else {shift @rna_exons; shift @rexon_lens;shift @raw_exons; $cds_start_exon_ind--;$cds_end_exon_ind--} # both ind decremented due to shift # print "new rexon_lens[$i]=$rexon_lens[$i]"; # print "c_mRNA_start=$c_mRNA_start\n"; $extra-=$last_rexon_len; # print "c_mRNA_start=$c_mRNA_start\n"; if ($aux_skip_flg==0) { # for($i = 1; $i <= $#raw_exons;$i++){ while ($i == 0) { #exit due to shift arrays $last_rna_intron_len=0; if ($rna_introns[$i] ne '..') { $last_rna_intron_len=length($rna_introns[$i]); } # end if good intron if (($extra)<($last_rna_intron_len)) { $rna_intron=substr($rna_introns[$i],$extra); $rna_introns[$i]=$rna_intron; $left_intron_protrud=1; last; } else {shift @splice_sites;shift @rna_introns} $extra-=$last_rna_intron_len; $last_rexon_len=$rexon_lens[$i]; if (($extra)<($rexon_lens[$i])) { $rna_exon=substr($rna_exons[$i],$extra); $rna_exons[$i]=$rna_exon; $rexon_lens[$i]= $rexon_lens[$i]-$extra; # print $rexon_lens[$i],"\n";&press_any_key; last; } else {shift @rna_exons;shift @rexon_lens;shift @raw_exons; $cds_start_exon_ind--;$cds_end_exon_ind--} $extra-=$last_rexon_len; } #end for $i } #end if not skip # print ">>@rexon_lens<<\n"; } # end do trunc left # print "passed 5\n"; &press_any_key; # # trunc left if necessery # $i=0; ## print "$cds_start $cds_shift $c_mRNA_start $rexon_lens[$i]\n"; ## print "passed 5\n"; ## &press_any_key; # $last_rexon_len=$rexon_lens[$i]; # if (($cds_start-$cds_shift)<=($c_mRNA_start+$rexon_lens[$i]-1)) # { $aux=$cds_start-$cds_shift-$c_mRNA_start; ## print "aux=$aux\n";&press_any_key; # $rna_exon=substr($rna_exons[$i],$aux); # $rna_exons[$i]=$rna_exon; ## print "old $rexon_lens[0]=$rexon_lens[$i]\n"; # $rexon_lens[$i]= $rexon_lens[$i]-$aux; # $aux_skip_flg=1; # } else {shift @rna_exons; shift @rexon_lens;shift @raw_exons} ## print "new rexon_lens[$i]=$rexon_lens[$i]"; ## print "c_mRNA_start=$c_mRNA_start\n"; # $c_mRNA_start+=$last_rexon_len; ## print "c_mRNA_start=$c_mRNA_start\n"; # if ($aux_skip_flg==0) # { ## for($i = 1; $i <= $#raw_exons;$i++){ # while ($i == 0) { #exit due to shift arrays # $last_rna_intron_len=length($rna_introns[$i]); # if (($cds_start-$cds_shift)<=($c_mRNA_start+$last_rna_intron_len-1)) # { $aux=$cds_start-$cds_shift-$c_mRNA_start; ## print "$cds_start $cds_shift $c_mRNA_start $rexon_lens[$i]\n"; ## print "aux=$aux\n";&press_any_key; # $rna_intron=substr($rna_introns[$i],$aux); # $rna_introns[$i]=$rna_intron; # $left_intron_protrud=1; # last; # } else {shift @splice_sites;shift @rna_introns} # $c_mRNA_start+=$last_rna_intron_len; # $last_rexon_len=$rexon_lens[$i]; # if (($cds_start-$cds_shift)<=($c_mRNA_start+$rexon_lens[$i]-1)) # { $aux=$cds_start-$cds_shift-$c_mRNA_start; ## print "i=$i aux=$aux\n";&press_any_key; # $rna_exon=substr($rna_exons[$i],$aux); # $rna_exons[$i]=$rna_exon; # $rexon_lens[$i]= $rexon_lens[$i]-$aux; ## print $rexon_lens[$i],"\n";&press_any_key; # last; # } else {shift @rna_exons;shift @rexon_lens;shift @raw_exons} # $c_mRNA_start+=$last_rexon_len; # } #end for $i # } #end if not skip ## print ">>@rexon_lens<<\n"; # nov 5 trunc right if necessery $extra=0; if ($c_right_wing>$CDS_right_wing) {$extra=$c_right_wing-$CDS_right_wing} my $extra_right=$c_right_wing; if ($c_right_wing>$CDS_right_wing) {$extra_right=$CDS_right_wing} if ($extra>0) { $aux_skip_flg=0; $i=$#raw_exons; # &press_any_key; $last_rexon_len=$rexon_lens[$i]; if ($extra<$rexon_lens[$i]) { $rna_exon=substr($rna_exons[$i],0,$rexon_lens[$i]-$extra); $rna_exons[$i]=$rna_exon; $rexon_lens[$i]= $rexon_lens[$i]-$extra; $aux_skip_flg=1; } else {pop @rna_exons; pop @rexon_lens; pop @raw_exons} $extra-=$last_rexon_len; $j=$#rna_introns; # we dont know how many elements in # @rna_introns at this point if ($aux_skip_flg==0) { for($i = $#raw_exons;$i>0;$i--){ $last_rna_intron_len=length($rna_introns[$j]); if ($extra<$last_rna_intron_len) { $rna_intron=substr($rna_introns[$j],0,length($rna_introns[$j])-$extra); $rna_introns[$j]=$rna_intron; $right_intron_protrud=1; last; } else {pop @splice_sites;pop @rna_introns; } $extra-=$last_rna_intron_len; # print "5c_mRNA_end=$c_mRNA_end last_rna_intron_len=$last_rna_intron_len\n"; $last_rexon_len=$rexon_lens[$i]; if ($extra<$rexon_lens[$i]) { $rna_exon=substr($rna_exons[$i],0,$rexon_lens[$i]-$extra); $rna_exons[$i]=$rna_exon; $rexon_lens[$i]= $rexon_lens[$i]-$extra; last; } else {pop @rna_exons;pop @rexon_lens} # we dont need pop @raw_exons # since in for statement $#raw_exons $extra-=$last_rexon_len; $j--; } #end for $i } } #end if $extra # print "passed 6\n"; &press_any_key; # # trunc right if necessery ## if ($mRNA_tail>$CDS_right_wing) {$mRNA_tail=$CDS_right_wing} #SPL # my $c_mRNA_end=$mRNA_end; # # $aux_skip_flg=0; # $i=$#raw_exons; ## print "$cds_end $mRNA_tail $c_mRNA_end $rexon_lens[$i] i=$i\n"; ## print "passed 6 starting right wing\n"; ## &press_any_key; # $last_rexon_len=$rexon_lens[$i]; # if (($cds_end+$mRNA_tail)>=($c_mRNA_end-$rexon_lens[$i]+1)) # { $aux=-$cds_end-$mRNA_tail+$c_mRNA_end; ## print "aux=$aux\n";&press_any_key; # $rna_exon=substr($rna_exons[$i],0,$rexon_lens[$i]-$aux); # $rna_exons[$i]=$rna_exon; # $rexon_lens[$i]= $rexon_lens[$i]-$aux; # $aux_skip_flg=1; # } else {pop @rna_exons; pop @rexon_lens; pop @raw_exons} ## print "c_mRNA_end=$c_mRNA_end\n"; # $c_mRNA_end-=$last_rexon_len; ## print "2c_mRNA_end=$c_mRNA_end\n"; # $j=$#rna_introns; # we dont know how many elements in # # @rna_introns at this point ## print "3c_mRNA_end=$c_mRNA_end\n"; # if ($aux_skip_flg==0) # { # for($i = $#raw_exons;$i>0;$i--){ # print "4c_mRNA_end=$c_mRNA_end i=$i\n"; # $last_rna_intron_len=length($rna_introns[$j]); # if (($cds_end+$mRNA_tail)>=($c_mRNA_end-$last_rna_intron_len+1)) # { $aux=-$cds_end-$mRNA_tail+$c_mRNA_end; ## print "$cds_end $mRNA_tail $c_mRNA_end $rexon_lens[$i] i=$i\n"; ## print "aux=$aux\n";&press_any_key; # $rna_intron=substr($rna_introns[$j],0,length($rna_introns[$j])-$aux); # $rna_introns[$j]=$rna_intron; # $right_intron_protrud=1; # last; # } else {pop @splice_sites;pop @rna_introns; } # $c_mRNA_end-=$last_rna_intron_len; ## print "5c_mRNA_end=$c_mRNA_end last_rna_intron_len=$last_rna_intron_len\n"; # $last_rexon_len=$rexon_lens[$i]; # if (($cds_end-$mRNA_tail)>=($c_mRNA_end-$rexon_lens[$i]+1)) # { $aux=-$cds_end-$mRNA_tail+$c_mRNA_end; ## print "aux=$aux\n";&press_any_key; # $rna_exon=substr($rna_exons[$i],0,$rexon_lens[$i]-$aux); # $rna_exons[$i]=$rna_exon; # $rexon_lens[$i]= $rexon_lens[$i]-$aux; # last; # } else {pop @rna_exons;pop @rexon_lens} # we dont need pop @raw_exons # # since in for statement $#raw_exons # $c_mRNA_end-=$last_rexon_len; # $j--; # } #end for $i # } # nov6 # get left description of pre-CDS intron phases if appropriate # and get right description of post-CDS intron phases if appropriate my $dna_rec_start=$cds_start-$cds_shift; my $dna_rec_end=$cds_end+$mRNA_tail; my $c_pos=0; my $left_wing_mRNA_phase_rec=''; my $left_add_undef_intron_2nuc_corr=0; my $right_add_undef_intron_2nuc_corr=0; $j=0; $i=0; if ($left_intron_protrud==1) {if ($rna_introns[$j] ne '..') {$c_pos+=length($rna_introns[$j++]);} else {$left_add_undef_intron_2nuc_corr= $left_add_undef_intron_2nuc_corr+2; $j++; } $left_wing_mRNA_phase_rec='u';} $c_pos+=$rexon_lens[$i++]; while ($c_pos<$extra_left) { #??? # print "c_pos=$c_pos extra_l=$extra_left\n";&press_any_key; $left_wing_mRNA_phase_rec.='u'; $c_pos+=$rexon_lens[$i++]; if ($rna_introns[$j] ne '..') {$c_pos+=length($rna_introns[$j++]);} else {$left_add_undef_intron_2nuc_corr= $left_add_undef_intron_2nuc_corr+2;$j++} } $c_pos=0; my $right_wing_mRNA_phase_rec=''; $j=$#rna_introns; $i=$#rexon_lens; # print "introns=$j exons=$i\n"; &press_any_key; if ($right_intron_protrud==1) {if ($rna_introns[$j] ne '..') {$c_pos+=length($rna_introns[$j]); $j--;} else {$right_add_undef_intron_2nuc_corr= $right_add_undef_intron_2nuc_corr+2; $$j--; } $right_wing_mRNA_phase_rec='u'; } # print "2---- introns=$j exons=$i\n"; &press_any_key; $c_pos+=$rexon_lens[$i--]; while ($c_pos<$extra_right) { $right_wing_mRNA_phase_rec.='u'; # print "rexon_lens[$i]=$rexon_lens[$i]\n"; $c_pos+=$rexon_lens[$i--]; if ($rna_introns[$j] ne '..') {$c_pos+=length($rna_introns[$j--]);} else {$right_add_undef_intron_2nuc_corr= $right_add_undef_intron_2nuc_corr+2;$j--} } if ($mRNA_dir==0) {$out_CDS_start=$extra_left+1+$left_add_undef_intron_2nuc_corr} else {$out_CDS_start=$extra_right+1+$right_add_undef_intron_2nuc_corr}; # print "cds_shift=$cds_shift mRNA_tail=$mRNA_tail\n" ; # if ($mRNA_dir==0) {$aux=$mRNA_tail} else {$aux=$cds_shift}; # $out_CDS_end calc see later after $CDS_intr_len; # # get left description of pre-CDS intron phases if appropriate # # and get right description of post-CDS intron phases if appropriate # my $dna_rec_start=$cds_start-$cds_shift; # my $dna_rec_end=$cds_end+$mRNA_tail; # my $c_pos=$dna_rec_start; # my $left_wing_mRNA_phase_rec=''; # $j=0; $i=0; # if ($left_intron_protrud==1) {$c_pos+=length($rna_introns[$j++]); # $left_wing_mRNA_phase_rec='u';} # $c_pos+=$rexon_lens[$i++]; # while ($c_pos<=$cds_start) { # $left_wing_mRNA_phase_rec.='u'; # $c_pos+=$rexon_lens[$i++]+length($rna_introns[$j++]); # } # $c_pos=$dna_rec_end; # my $right_wing_mRNA_phase_rec=''; # $j=$#rna_introns; $i=$#rexon_lens; # if ($right_intron_protrud==1) {$c_pos-=length($rna_introns[$j]); $j--; # $right_wing_mRNA_phase_rec='u'; # } # $c_pos-=$rexon_lens[$i--]; # while ($c_pos>=$cds_end) { # $right_wing_mRNA_phase_rec.='u'; ## print "rexon_lens[$i]=$rexon_lens[$i]\n"; # $c_pos-=$rexon_lens[$i--]+length($rna_introns[$j--]); # } # if ($mRNA_dir==0) {$out_CDS_start=$cds_shift+1} # else {$out_CDS_start=$mRNA_tail+1}; ## print "cds_shift=$cds_shift mRNA_tail=$mRNA_tail\n" ; # if ($mRNA_dir==0) {$aux=$mRNA_tail} else {$aux=$cds_shift}; # $out_CDS_end=$dna_rec_end-($dna_rec_start-1)-$aux; # #print "left_ph=$left_wing_mRNA_phase_rec right_ph=$right_wing_mRNA_phase_rec\n"; #&press_any_key; #aux printing intermediate structure for debug # print "left=$left_intron_protrud right=$right_intron_protrud\n"; # $j=0; # if ($left_intron_protrud==1) {print length($rna_introns[$j]),' ' ;$j=1} # for ($i=0;$i<=$#rna_exons-1; $i++) # {print length($rna_exons[$i]),' ',length($rna_introns[$j++]),' ';} # print length($rna_exons[$#rna_exons]),' '; # if ($right_intron_protrud==1) {print length($rna_introns[$j]),' '}; # &press_any_key; if ($mRNA_dir==1) { #SPL ORI if($cdsline =~ /complement\(join/){ # ) @rexon_lens = reverse @rexon_lens; @splice_sites = reverse @splice_sites; foreach (@splice_sites){ $_ = reverse $_;$_ =~ tr/actg[a-z]/tgacn/; } @rna_exons = reverse(@rna_exons); foreach (@rna_exons){$_ = reverse $_;$_ =~ tr/ACTG[A-Z]/TGACN/;} @rna_introns = reverse(@rna_introns); foreach (@rna_introns){$_ = reverse $_;$_ =~ tr/actg[a-z]/tgacn/;} my $aux_s=$right_wing_mRNA_phase_rec; $right_wing_mRNA_phase_rec=$left_wing_mRNA_phase_rec; $left_wing_mRNA_phase_rec=$aux_s; #print "left_ph=$left_wing_mRNA_phase_rec right_ph=$right_wing_mRNA_phase_rec\n"; $aux=$right_intron_protrud;$right_intron_protrud=$left_intron_protrud; $left_intron_protrud=$aux; #&press_any_key; } #SPL counting intron number my $in_entry_intron_c=0; #a20 my $in_entry_exon_c=1; #a20 my $in_entry_nonc_intron_c=0; #a20 my $c_splice_site=''; #a20 my $in_entry_nonc_atac_intron_c=0; #a20 foreach $c_splice_site (@splice_sites){ $total_intron_c++; $in_entry_intron_c++; #a20 $in_entry_exon_c++; #a20 if (($c_splice_site eq 'EEEE') or ($c_splice_site=~/n|N/)) {$undef_intron_c++; #a21 &trace("invalid_splice_site=$c_splice_site",4); } elsif ($c_splice_site ne 'gtag') {$alt_intron_c++; $in_entry_nonc_intron_c++; #a20 #a21 &trace("splice_site=$c_splice_site",4); } if ($c_splice_site=~/n/) {$explicit_n_undef_intron_c++} #a20 if ($c_splice_site eq 'atac') {$in_entry_nonc_atac_intron_c++ } #a20 } if ($in_entry_intron_c >=$c_in_gene_intron_c_max) {$c_in_gene_intron_c_max=$in_entry_intron_c} #a20 if ($in_entry_exon_c >=$c_in_gene_exon_c_max) {$c_in_gene_exon_c_max=$in_entry_exon_c} #a20 if ($in_entry_nonc_intron_c >=$c_in_gene_nonc_intron_c_max) {$c_in_gene_nonc_intron_c_max=$in_entry_nonc_intron_c} #a20 if ($in_entry_nonc_atac_intron_c >=$c_in_gene_nonc_atac_intron_c_max) {$c_in_gene_nonc_atac_intron_c_max=$in_entry_nonc_atac_intron_c} #a20 # handle extra-short and extra-long introns #a21 my $in_entry_eshort_intron_c=0; #a21 my $in_entry_elong_intron_c=0; #a21 my $in_entry_undef_intron_c=0; #nov7 my $c_i_len=0; foreach (@rna_introns){ $c_i_len=length($_); if ($_ eq '..') {$in_entry_undef_intron_c++} else {if ($c_i_len<30) {$in_entry_eshort_intron_c++}} # if ($c_i_len<30) {&trace ("c_i_len=$c_i_len",0)} if ($c_i_len>100000) {$in_entry_elong_intron_c++} } #end for each intron a21 if ($in_entry_eshort_intron_c>$c_in_gene_eshort_intron_c_max) {$c_in_gene_eshort_intron_c_max=$in_entry_eshort_intron_c} if ($in_entry_undef_intron_c>$c_in_gene_undef_intron_c_max) {$c_in_gene_undef_intron_c_max=$in_entry_undef_intron_c} if ($in_entry_elong_intron_c>$c_in_gene_elong_intron_c_max) {$c_in_gene_elong_intron_c_max=$in_entry_elong_intron_c} # end handle extra-short and extra-long introns #a21 # print "passed 7\n"; &press_any_key; foreach (@rna_exons){$rna_exons_c++} foreach (@rna_introns){$rna_introns_c++} @raw_exons = &exons_from_cds($cdsline); ## essentially splits the line along commas my (@dna_exons, @exon_lens, @dna_introns); # print ">>>@raw_exons<<< $#raw_exons\n"; unless (($dna_exon, $exon_len) = &get_exonDNA($raw_exons[0])){ return; } push(@dna_exons, $dna_exon); push(@exon_lens, $exon_len); for($i = 1; $i <= $#raw_exons;$i++){ (($dna_exon, $exon_len) = &get_exonDNA($raw_exons[$i])) || return; push(@dna_exons, $dna_exon); push(@exon_lens, $exon_len); (($dna_intron, $splice_site) = &get_intronDNA_GB149($raw_exons[$i-1],$raw_exons[$i])) || return; # push(@splice_sites, $splice_site); push(@dna_introns, $dna_intron); } if($cdsline =~ /complement\(join/){ # ) @exon_lens = reverse @exon_lens; # @splice_sites = reverse @splice_sites; # foreach (@splice_sites){$_ = reverse $_;$_ =~ tr/actg[a-z]/tgacn/;} @dna_exons = reverse(@dna_exons); foreach (@dna_exons){$_ = reverse $_;$_ =~ tr/ACTG[A-Z]/TGACN/;} @dna_introns = reverse(@dna_introns); foreach (@dna_introns){$_ = reverse $_;$_ =~ tr/actg[a-z]/tgacn/;} } # print "!!!@exon_lens!!!\n"; $out_CDS_len=0;foreach (@exon_lens){$out_CDS_len+=$_}; #SPL my $CDS_intr_len=0;foreach (@exon_lens){$CDS_intr_len+=$_; # print "CDS_intr_len=$CDS_intr_len\n"; }; #SPL foreach (@dna_introns){$CDS_intr_len+=length($_); # print "CDS_intr_len=$CDS_intr_len\n"; }; #SPL #here we consider intron ='..' as having len=2 $out_CDS_end=$out_CDS_start+$CDS_intr_len-1; foreach (@dna_exons){$dna_exons_c++} $splice_info = join("\,",@splice_sites); #print $splice_info,"\n"; #SPL my $aux_sum=0; for (my $k=0;$k<=$#dna_exons;$k++) { $aux_sum+=length($dna_exons[$k]); # print length($dna_exons[$k]),' ',$aux_sum,' ',$aux_sum/3, "\n"; } #&press_any_key; #SPL $CDS_sq=&get_CDS_sq(\@dna_exons); #SPL &init_gene_code; #SPL $translation=''; if (not defined( $codon_start)) {$codon_start=0} my $extra_cod=(length($CDS_sq)-$codon_start)%3; # if ($gene_name eq 'PIM2') { # print "$CDS_sq\n"; # print length($CDS_sq),"\n"; # print "codon_start=$codon_start\n"; # print "extra_cod=$extra_cod\n"; # &press_any_key; # } $translate_err_flg=0; if ($extra_cod!=0) { my $aux_e_cod=substr($CDS_sq,length($CDS_sq)-$extra_cod,$extra_cod); # if ($gene_name eq 'PIM2') {print "$aux_e_cod\n";&press_any_key;} if (($aux_e_cod ne 'AA') && ($aux_e_cod ne 'AG') && ($aux_e_cod ne 'GA')) { $translate_err++; $translate_err_flg=1; #no further &trace("old vers tran err in ".$gene_name,4) } } while ($extra_cod>0) {chop($CDS_sq);$extra_cod--} my $aux_code=&translate1(substr($CDS_sq,$codon_start)); #SPL # if ($gene_name eq 'PIM2') {print "$translation\n";&press_any_key;} # if ($aux_code!=0) {$err_code=11} #err code #SPL if ($aux_code!=0) {&trace("in-frame invalid codon",4); $tran_err_id_num++; $some_CDS_with_invalid_codons_flg=1; } #SPL $aux_code=&init_codon(substr($CDS_sq,$codon_start,3)); #SPL if ($aux_code!=0) {&trace("not starting with init codon",4); $init_codon_err++} #SPL my $stop_code=&check_trunc_translate; #SPL if ($stop_code!=0) {&trace("in-frame stop codon",4); $tran_stop_id_num++;} #SPL if ($stop_code ==0) {$all_CDS_with_stop_codons_flg=0} #a20 if (($stop_code!=0) && ($translate_err_flg ==1)) {$translate_err_stop_cod_c++} #SPL counting intron number in bad CDS if ($stop_code!=0) { $c_splice_site=''; foreach $c_splice_site (@splice_sites){ $total_intron_in_entr_stop_cod_c++; if (($c_splice_site eq 'EEEE') or ($c_splice_site =~/N|n/)) {} #a21 correction elsif ($c_splice_site ne 'gtag') {$alt_intron_in_entr_stop_cod_c++} }} # print $translation,"\n"; # print "CDS_len=",length($CDS_sq)," tran_len=",length($translation),"\n"; # &press_any_key; substr($dna_exons[0], 0, $codon_start) = ""; $exon_lens[0] -= $codon_start; ($phases, $p_exonlens, $p_intr_pos) = &get_prot_info_via_tran(\@exon_lens); #SPL ORI ($phases, $p_exonlens, $p_intr_pos) = &get_prot_info(\@exon_lens); ## this also changes $translation my $dphases=$left_wing_mRNA_phase_rec.$phases.$right_wing_mRNA_phase_rec; #SPL correct splice_info due to u symbols in phases. $i=0; my @psplice_sites=(); # print $#splice_sites+1,' ', length($dphases),"\n";&press_any_key; foreach( @splice_sites) {if (substr($dphases,$i++,1) ne 'u'){ push @psplice_sites,$_; } } $psplice_info = join("\,",@psplice_sites); # print "phases=$phases, p_exonlens=$p_exonlens, p_intr_pos=$p_intr_pos\n"; # &press_any_key; ($d_intron_sizes, $d_exon_sizes, $d_i_sum, $d_e_sum, $DNA) = &make_DNA_via_rna(\@rna_exons, \@rna_introns,$left_intron_protrud,$right_intron_protrud); #SPL ORI ($d_intron_sizes, $d_exon_sizes, $d_i_sum, $d_e_sum, $DNA) = &make_DNA(\@dna_exons, \@dna_introns); # print $d_exon_sizes,"\n"; # &press_any_key; if ($cds_info =~ /\n \/pseudo/) {$pseudo_flg=1} #s15 ($prot_id) = ($cds_info =~ /\n \/protein_id=\"(.*)\"/); if (defined($prot_id)) {} else {&trace("no protein_id", 4); $no_protein_id_c++; if ($UTR_NF_flg!=0) {$no_protein_id_mRNA_NF_c++} #return } #SPL ORI defined($prot_id) || return; # defined($prot_id) || die "No protein id: $locus, $cds_info\n"; $defline.=" /gene=\"$gene_name\""; #SPL adding gene name to header ## now we can print #temp see all: #SPL if ($stop_code==0) { #SPL &print_header($phases, $p_exonlens, $p_intr_pos, $psplice_info); #SPL ORI &print_header($phases, $p_exonlens, $p_intr_pos, $splice_info); &print_dna($dphases, $d_intron_sizes, $d_exon_sizes, $d_i_sum, $d_e_sum, $DNA, $splice_info,$stop_code); #SPL ORI &print_dna($phases, $d_intron_sizes, $d_exon_sizes, $d_i_sum, $d_e_sum, $DNA, $splice_info); &print_prots($phases, $p_exonlens, $p_intr_pos, $psplice_info); #SPL ORI &print_prots($phases, $p_exonlens, $p_intr_pos, $splice_info); } #end if ($stop_code==0) } sub print_prots{ my ($phases, $p_exonlens, $p_intr_pos, $splice_info) = @_; my $exon_no = length($phases) + 1; my $exon_sum = length($translation); if (defined($prot_id)) {} else {$prot_id='none'} print PROTS "> ${id}_$locus protein_id:$prot_id; $defline; "; print PROTS "intron(phase:$phases,position:$p_intr_pos)\; exon(number:$exon_no,size:${p_exonlens}"; print PROTS "sum:$exon_sum); {splice:$splice_info}"; my $aux_p_line:=''; if ($CDS_incomplete_flg==1) {$aux_p_line=' CDS_incomplete'}; if ($pseudo_flg !=0) {$aux_p_line.=' PSEUDO'}; print PROTS "$aux_p_line\n"; &print_nicely("PROTS", $translation); print PROTS "\n"; } sub print_dna{ my ($phases, $d_intron_sizes, $d_exon_sizes, $d_i_sum, $d_e_sum, $DNA, $splice_info,$stop_code) = @_; if (defined($prot_id)) {} else {$prot_id='none'} #s15 print DNAS "> ${id}_$locus protein_id:$prot_id; $defline; "; print DNAS "intron(phase:$phases,size:${d_intron_sizes}intr_sum:$d_i_sum)\; "; print DNAS "exon(size:${d_exon_sizes}ex_sum:$d_e_sum); {splice:$splice_info}; "; #SPL ORI print DNAS "exon(size:${d_exon_sizes}ex_sum:$d_e_sum); {splice:$splice_info}\n"; print DNAS "CDS_start=$out_CDS_start, CDS_end=$out_CDS_end, CDS_len=$out_CDS_len";#SPL my $aux_p_line=''; if ($stop_code !=0) {$aux_p_line=' STOP_CODON'}; if ($pseudo_flg !=0) {$aux_p_line.=' PSEUDO'}; if ($UTR_AMBI_flg==1) {$aux_p_line.=' UTR_AMBI'}; if ($UTR_NF_flg!=0) {$aux_p_line.=' UTR_NF';$expl_UTR_NF_c++}; if ($CDS_incomplete_flg==1) {$aux_p_line.=' CDS_incomplete';$CDS_incmpl_c++}; if ($mRNA_incomplete_flg==1) {$aux_p_line.=' mRNA_incomplete';$mRNA_incmpl_c++}; print DNAS "$aux_p_line\n"; &print_nicely("DNAS", $DNA); print DNAS "\n"; } sub print_nicely{ my ($out, $string) = @_; $COL = 80; for(my $j = 0; $j < length($string); $j += $COL){ print $out substr($string, $j, $COL) . "\n"; } } sub print_header{ my ($phases, $p_exonlens, $p_intr_pos, $splice_info) = @_; my $exon_no = length($phases) + 1; my $exon_sum = length($translation); # print "$exon_sum\n"; #SPL # &press_any_key; #SPL if (defined($prot_id)) {} else {$prot_id='none'} #s15 print HEADERS "> ${id}_$locus protein_id:$prot_id; $defline;\n"; print HEADERS "intron(phase:$phases,position:$p_intr_pos)\; exon(number:$exon_no,size:${p_exonlens}"; print HEADERS "sum:$exon_sum); {splice:$splice_info}\n"; if ($CDS_incomplete_flg==1) {print HEADERS "CDS_incomplete\n"}; print HEADERS "$header"; $cds_info =~ s/\n\s*\n/\n/g; # compress the text a little print HEADERS "$cds_info//\n\n"; # the "//" will mark the end of an entry } sub make_DNA{ my @dna_exons = @{$_[0]}; my @dna_introns = @{$_[1]}; my ($intron_sizes, $exon_sizes, $i_sum, $e_sum, $DNA) = ("", "", 0, 0, ""); my ($i_len, $e_len, $i); ($#dna_exons == $#dna_introns + 1) || die "The number of introns does not match the number of exons in $locus\n$cds_info\n"; for($i = 0; $i < $#dna_exons; $i++){ $i_len = length $dna_introns[$i]; $e_len = length $dna_exons[$i]; $DNA .= $dna_exons[$i]; $DNA .= $dna_introns[$i]; if($dna_introns[$i] eq ".."){ $intron_sizes .= "u,"; $i_sum = "-1"; } else{ $intron_sizes .= "$i_len,"; ($i_sum == -1) || ($i_sum += $i_len); } $e_sum += $e_len; $exon_sizes .= "$e_len,"; } $DNA .= $dna_exons[$#dna_exons]; $e_len = length( $dna_exons[$#dna_exons]); $e_sum += $e_len; $exon_sizes .= "$e_len,"; # print "$e_sum\n"; #SPL # &press_any_key; #SPL return ($intron_sizes, $exon_sizes, $i_sum, $e_sum, $DNA); } sub make_DNA_via_rna{ #SPL made from make_DNA my @dna_exons = @{$_[0]}; my @dna_introns = @{$_[1]}; my $left_protr=$_[2]; my $right_protr=$_[3]; my ($intron_sizes, $exon_sizes, $i_sum, $e_sum, $DNA) = ("", "", 0, 0, ""); my ($i_len, $e_len, $i); my $j=0; if ($left_protr==1) {$i_len =length($dna_introns[$j]); $DNA .= $dna_introns[$j]; if($dna_introns[$j] eq ".."){ $intron_sizes .= "u,"; $i_sum = "-1"; } else{ $intron_sizes .= "$i_len,"; ($i_sum == -1) || ($i_sum += $i_len); } $j++; } for ($i=0;$i<=$#dna_exons-1; $i++) {$i_len = length $dna_introns[$j]; $e_len = length $dna_exons[$i]; $DNA .= $dna_exons[$i]; $DNA .= $dna_introns[$j]; if($dna_introns[$j] eq ".."){$intron_sizes .= "u,"; $i_sum = "-1";} else {$intron_sizes .= "$i_len,";($i_sum == -1) || ($i_sum += $i_len);} $j++; $e_sum += $e_len; $exon_sizes .= "$e_len,"; } $i=$#dna_exons; $e_len = length $dna_exons[$i]; $DNA .= $dna_exons[$i]; $e_sum += $e_len; $exon_sizes .= "$e_len,"; if ($right_protr==1) { $i_len = length $dna_introns[$j]; $DNA .= $dna_introns[$j]; if($dna_introns[$j] eq ".."){$intron_sizes .= "u,";$i_sum = "-1";} else{$intron_sizes .= "$i_len,";($i_sum == -1) || ($i_sum += $i_len);} }; # print "intron_sizes=$intron_sizes, exon_sizes=$exon_sizes, # i_sum=$i_sum, e_sum=$e_sum\n"; # &press_any_key; return ($intron_sizes, $exon_sizes, $i_sum, $e_sum, $DNA); } sub get_CDS_sq #(\@dna_exons) return $CDS_sq { my @dna_exons = @{$_[0]}; my $CDS_sq = ""; for(my $i = 0; $i <=$#dna_exons; $i++){ $CDS_sq .= $dna_exons[$i]; } # print $CDS_sq,"\n"; # &press_any_key; return $CDS_sq; } sub get_prot_info{ my @exon_lens = @{$_[0]}; my $phases = ""; my $intr_pos = ""; my $p_exon_lens = ""; my $n_ipos = 0; my ($p_intr_pos_new, $phase, $p_len ); my($translation); my $i=0; #SPL my declaration in one place my $p_intr_pos_old = 1; $transl_len = 0; for($i = 0; $i <= $#exon_lens; $i++){ $transl_len += $exon_lens[$i]; } #SPL ORI for($i = 0; $i < $#exon_lens; $i++){ #SPL ORI $transl_len += $exon_lens[$i]; #SPL ORI } #SPL this bug follows that last p_exon is always=1 $transl_len /= 3; $translation = ""; for($i = 0; $i < $transl_len; $i++){ $translation .= "Z"; } # print "@exon_lens"; #SPL # print "$transl_len\n"; #SPL # print "$translation\n"; #SPL # &press_any_key; #SPL for($i = 0; $i < $#exon_lens; $i++){ $n_ipos += $exon_lens[$i]; $phase = $n_ipos % 3; $phases .= "$phase"; $p_intr_pos_new = ($n_ipos - $phase) / 3 + 1; # print "$p_intr_pos_new\n"; #SPL $intr_pos .= "$p_intr_pos_new,"; if($p_intr_pos_new <= length($translation)){ substr($translation, $p_intr_pos_new - 1, 1) =~ tr/A-Z/a-z/; } $p_len = $p_intr_pos_new - $p_intr_pos_old; $p_exon_lens .= "$p_len,"; $p_intr_pos_old = $p_intr_pos_new; } $p_len = length($translation) + 1 - $p_intr_pos_old; $p_exon_lens .= "$p_len,"; chop($intr_pos); ## drop the last comma # print "$p_exon_lens\n"; return($phases, $p_exon_lens, $intr_pos); } sub get_prot_info_via_tran{ #SPL this new modification of get_prot_info sub to #SPL suppress a bug in translation length calculation #SPL (2) correction for the situation when last intron is in #SPL stop codon: virtually no via translation my @exon_lens = @{$_[0]}; my $phases = ""; my $intr_pos = ""; my $p_exon_lens = ""; my $n_ipos = 0; my ($p_intr_pos_new, $phase, $p_len ); # my($translation); my $i=0; #SPL my declaration in one place my $p_intr_pos_old = 1; $transl_len = 0; for($i = 0; $i <= $#exon_lens; $i++){ $transl_len += $exon_lens[$i]; } while (($transl_len %3) !=0) {$transl_len++} #SPL $transl_len /=3; # $translation = ""; # for($i = 0; $i < $transl_len; $i++){ # $translation .= "Z"; # } # if ($gene_name eq 'NUDT11') { # print "@exon_lens\n"; #SPL # print "$transl_len\n"; #SPL # print "$translation\n"; #SPL # &press_any_key; #SPL # } for($i = 0; $i < $#exon_lens; $i++){ $n_ipos += $exon_lens[$i]; $phase = $n_ipos % 3; $phases .= "$phase"; $p_intr_pos_new = ($n_ipos - $phase) / 3 + 1; # if ($gene_name eq 'NUDT11') { # print "$p_intr_pos_new\n"; #SPL # } $intr_pos .= "$p_intr_pos_new,"; # if($p_intr_pos_new <= length($translation)){ # substr($translation, $p_intr_pos_new - 1, 1) =~ tr/A-Z/a-z/; # } $p_len = $p_intr_pos_new - $p_intr_pos_old; $p_exon_lens .= "$p_len,"; $p_intr_pos_old = $p_intr_pos_new; } $p_len = $transl_len - $p_intr_pos_old; #SPL this is done to # get sum p_exon_lens #SPL ORI $p_len = $transl_len + 1 - $p_intr_pos_old; $p_exon_lens .= "$p_len,"; chop($intr_pos); ## drop the last comma return($phases, $p_exon_lens, $intr_pos); } sub exons_from_cds{ my ($start, $end); my $cds = $_[0]; $cds =~ s/\s+//g; $cds =~ s/CDSjoin\((.*)\)$/$1/; $cds =~ s/CDScomplement\(join\((.*)\)\)$/$1/g; # &print_cds_line($cds); #SPL # &press_any_key; #SPL return(split(/\,/, $cds)); } sub exons_from_cds_GB149{ # special variant for mRNA my $cds = $_[0]; $cds =~ s/\s+//g; if ($cds =~/^mRNA/) {$cds =~s/^mRNA/CDS/} $cds =~ s/CDSjoin\((.*)\)$/$1/; $cds =~ s/CDScomplement\(join\((.*)\)\)$/$1/g; # &print_cds_line($cds); #SPL # &press_any_key; #SPL return(split(/\,/, $cds)); } sub get_feature_line_sh{ # clear spaces of feature record my $c_ft =substr($_[0],21); $c_ft =~ s/\s+//g; return $c_ft; } sub get_feature_line_ve # get venter of the feature line # assume spaces deleted { my $cds = $_[0]; $cds =~ s/join\((.*)\)$/$1/; $cds =~ s/complement\(join\((.*)\)\)$/$1/g; # &print_cds_line($cds); #SPL # &press_any_key; #SPL return(split(/\,/, $cds)); } sub get_exonDNA{ my ($length, $exon, $positionsi, $start, $end); my $access ="current"; my $rawexon = $_[0]; my $complement = ($rawexon =~ s/^complement\((.*)\)$/$1/); # print $rawexon,"\n"; #SPL # &press_any_key; #SPL my @temp = split(/\:/, $rawexon); # print "temp>> @temp\n"; #SPL # &press_any_key; #SPL if($#temp == 1){ ($access, $positions) = @temp; } elsif($#temp == 0){ $positions = $temp[0]; } else{ die "Could not parse the cds info in\n$cds_info\nspecifically:$rawexon\nin $locus\n"; } # print "$access $DNA_access_codes{$access}\n"; # print "$access\n"; #ORI defined($DNA_access_codes{$access}) || return; #SPL if no sequence out respective NNN's ## now get the actual start and end ($start, $end) = ($positions =~ /\d+/g); if(!defined $end){ ## if there is only one nucleotide $end = $start; } $length = $end - $start + 1; # print "start=$start end=$end\n"; # &press_any_key; (defined($length) && $length > 0) || die "Could not parse the cds info in\n$cds_info\nspecifically:$rawexon\nin $locus\n"; if (defined($DNA_access_codes{$access})) { ($end <= length($DNA_access_codes{$access})) || die "One of the indices is out of range: $end in $rawexon(start is $start) in $locus\n$cds_info\n"; $exon = substr($DNA_access_codes{$access}, $start -1, $length); if(!defined($exon)){ die "Could not get an exon from \n$cds_info\nspecifically:$rawexon\nin $locus\n"; } } else {$exon=''; for ($j=1; $j<=$length; $j++) {$exon.='N'}} $exon = uc($exon); if($complement){ $exon =~ tr/ACTG[A-Z]/TGACN/; ##complement $exon = reverse($exon); } return($exon, $length); } sub get_intronDNA{ ### $isCoding && return("..","UUUU"); my ($rawexon1, $rawexon2) = ($_[0], $_[1]); my $intron = ".."; my ($access1, $access2) = ("current", "current"); my ($splice5, $splice3, $end1, $end2, $start1, $start2, $length); my ($positions1, $positions2); my $complement1 = ($rawexon1 =~ s/^complement\((.*)\)$/$1/); my $complement2 = ($rawexon2 =~ s/^complement\((.*)\)$/$1/); my @temp1 = split(/\:/, $rawexon1); my @temp2 = split(/\:/, $rawexon2); (defined($complement1) && defined($complement2)) || die "Complement error: $rawexon1, $rawexon2,\n$cds_info\n$locus\n"; if($#temp1 == 1){ ($access1, $positions1) = @temp1; } elsif($#temp1 == 0){ $positions1 = $temp1[0]; } if($#temp2 == 1){ ($access2, $positions2) = @temp2; } elsif($#temp2 == 0){ $positions2 = $temp2[0]; } ## now get the indices ($start1, $end1) = ($positions1 =~ /\d+/g); if(!defined $end1){ ## if there is only one nucleotide $end1 = $start1; } ($start2, $end2) = ($positions2 =~ /\d+/g); if(!defined $end2){ ## if there is only one nucleotide $end2 = $start2; } --$start1; --$end1; --$start2; --$end2; ##in genbank indexing starts at 1, not 0 ## get all the splicing info ####first the 5' site if($complement1){ if($start1 < 3){ $splice5 = "NN"; } else{ $splice5 = substr($DNA_access_codes{$access1}, $start1 - 2, 2); $splice5 =~ tr/actg[a-z]/tgacn/; $splice5 = reverse $splice5; } } else{ if($end1 + 2 > length($DNA_access_codes{$access1}) - 1){ $splice5 = "NN"; } else{ $splice5 = substr($DNA_access_codes{$access1}, $end1 + 1, 2); } } ####now the 3' site if($complement2){ if($end2 + 2 > length($DNA_access_codes{$access2}) - 1){ $splice3 = "NN"; } else{ $splice3 = substr($DNA_access_codes{$access2}, $end2 + 1, 2); $splice3 =~ tr/actg[a-z]/tgacn/; $splice3 = reverse $splice3; } } else{ if($start2 < 3){ $splice3 = "NN"; } else{ $splice3 = substr($DNA_access_codes{$access2}, $start2 - 2, 2); } } if(($access1 eq $access2) && ($complement1 eq $complement2)){ ## this is the only case where we can get an intron if($complement1){ ## _both_ are on the complementary strand $length = $start1 - $end2 - 1; if($length < 0){ ## can't get the intron } else{ $intron = substr($DNA_access_codes{$access1}, $end2 +1, $length); $intron =~ tr/actg[a-z]/tgacn/; $intron = reverse $intron; } } else{ $length = $start2 - $end1 -1; if($length < 0){ ## no intron } else{ $intron = substr($DNA_access_codes{$access1}, $end1 + 1, $length); } } ## need to test for intron length here if($length == 0){ $splice3 = $splice5 = "NN"; $intron = ".." } elsif($length < 4){ $splice3 = $splice5 = "EE"; } } if($cDNA_codes{$access1}){ $splice5 = "NN"; $intron = ".."; } if($cDNA_codes{$access2}){ $splice3 = "NN"; $intron = ".."; } return($intron, $splice5 . $splice3); } sub get_intronDNA_GB149{ ### $isCoding && return("..","UUUU"); my ($rawexon1, $rawexon2) = ($_[0], $_[1]); my $intron = ".."; my ($access1, $access2) = ("current", "current"); my ($splice5, $splice3, $end1, $end2, $start1, $start2, $length); my ($positions1, $positions2); my $complement1 = ($rawexon1 =~ s/^complement\((.*)\)$/$1/); my $complement2 = ($rawexon2 =~ s/^complement\((.*)\)$/$1/); my @temp1 = split(/\:/, $rawexon1); my @temp2 = split(/\:/, $rawexon2); (defined($complement1) && defined($complement2)) || die "Complement error: $rawexon1, $rawexon2,\n$cds_info\n$locus\n"; if($#temp1 == 1){ ($access1, $positions1) = @temp1; } elsif($#temp1 == 0){ $positions1 = $temp1[0]; } if($#temp2 == 1){ ($access2, $positions2) = @temp2; } elsif($#temp2 == 0){ $positions2 = $temp2[0]; } ## now get the indices ($start1, $end1) = ($positions1 =~ /\d+/g); if(!defined $end1){ ## if there is only one nucleotide $end1 = $start1; } ($start2, $end2) = ($positions2 =~ /\d+/g); if(!defined $end2){ ## if there is only one nucleotide $end2 = $start2; } --$start1; --$end1; --$start2; --$end2; ##in genbank indexing starts at 1, not 0 ## get all the splicing info ####first the 5' site if($complement1){ if($start1 < 3){ $splice5 = "NN"; } else{ if (defined($DNA_access_codes{$access1})) {$splice5 = substr($DNA_access_codes{$access1}, $start1 - 2, 2); $splice5 =~ tr/actg[a-z]/tgacn/; $splice5 = reverse $splice5; } else {$splice5 = "NN"} } } else{ if (defined($DNA_access_codes{$access1})) {if($end1 + 2 > length($DNA_access_codes{$access1}) - 1){ $splice5 = "NN"; } else{ $splice5 = substr($DNA_access_codes{$access1}, $end1 + 1, 2); } } #end if defined else {$splice5 = "NN"} } ####now the 3' site if($complement2){ if (defined($DNA_access_codes{$access2})) { if($end2 + 2 > length($DNA_access_codes{$access2}) - 1){ $splice3 = "NN"; } else{ $splice3 = substr($DNA_access_codes{$access2}, $end2 + 1, 2); $splice3 =~ tr/actg[a-z]/tgacn/; $splice3 = reverse $splice3; } } #end if defined else {$splice3 = "NN"} } else{ if (defined($DNA_access_codes{$access2})) { if($start2 < 3){ $splice3 = "NN"; } else{ $splice3 = substr($DNA_access_codes{$access2}, $start2 - 2, 2); } } #end if defined else {$splice3 = "NN"} } if(($access1 eq $access2) && ($complement1 eq $complement2)){ ## this is the only case where we can get an intron if($complement1){ ## _both_ are on the complementary strand $length = $start1 - $end2 - 1; if($length < 0){ ## can't get the intron } else{ if (defined($DNA_access_codes{$access1})) {$intron = substr($DNA_access_codes{$access1}, $end2 +1, $length); $intron =~ tr/actg[a-z]/tgacn/; $intron = reverse $intron; } else {$intron = ".."} } } else{ $length = $start2 - $end1 -1; if($length < 0){ ## no intron } else{ if (defined($DNA_access_codes{$access1})) {$intron = substr($DNA_access_codes{$access1}, $end1 + 1, $length);} else {$intron = ".."} } } ## need to test for intron length here if($length == 0){ $splice3 = $splice5 = "NN"; $intron = ".." } elsif($length < 4){ $splice3 = $splice5 = "EE"; } } if($cDNA_codes{$access1}){ $splice5 = "NN"; $intron = ".."; } if($cDNA_codes{$access2}){ $splice3 = "NN"; $intron = ".."; } #SPL # if (!defined($DNA_access_codes{$access1})) { # print "NN1: $access1 "; # $splice5 = "NN"; # $intron = ".."; # } # if (!defined($DNA_access_codes{$access2})) { #dir ??? # print "NN2: $access2\n"; # $splice3 = "NN"; # $intron = ".."; # } # print (substr($intron,0,10),'..',substr($intron,-10),' ', $splice5 . $splice3); &press_any_key; return($intron, $splice5 . $splice3); } sub get_exon_bi_ref # (rawexon) return($exon_bi_ref, $length,$complement) { # get exon as bi-reference # example AL012345:23..75 -> AL012345:23..AL012345:75 #made from get_exonDNA my ($length, $exon, $positionsi, $start, $end); my $exon_bi_ref=''; my $access ="current"; my $rawexon = $_[0]; # print "before>>\n",$rawexon,"\n"; #SPL my $complement = ($rawexon =~ s/^complement\((.*)\)$/$1/); # print $rawexon,"\n"; #SPL # &press_any_key; #SPL my @temp = split(/\:/, $rawexon); # print "temp>> @temp\n"; #SPL # &press_any_key; #SPL if($#temp == 1){ ($access, $positions) = @temp; } elsif($#temp == 0){ $positions = $temp[0]; } else{ die "Could not parse the cds info in\n$cds_info\nspecifically:$rawexon\nin $locus\n"; } # if ($gene_name eq'SET') {print "passed first die\n"; &press_any_key;} # print "$access $DNA_access_codes{$access}\n"; # print "$access\n"; #SPL defined($DNA_access_codes{$access}) || return; ## now get the actual start and end ($start, $end) = ($positions =~ /\d+/g); if(!defined $end){ ## if there is only one nucleotide $end = $start; } $length = $end - $start + 1; # print "start=$start end=$end\n"; # &press_any_key; (defined($length) && $length > 0) || die "Could not parse the cds info in\n$cds_info\nspecifically:$rawexon\nin $locus\n"; if (defined($DNA_access_codes{$access})) { ($end <= length($DNA_access_codes{$access})) || die "One of the indices is out of range: $end in $rawexon(start is $start) in $locus\n$cds_info\n"; } # if ($gene_name eq'SET') {print "passed 2nd die\n"; &press_any_key;} #SPL $exon = substr($DNA_access_codes{$access}, $start -1, $length); #SPL if(!defined($exon)){ #SPL die "Could not get an exon from \n$cds_info\nspecifically:$rawexon\nin $locus\n"; #SPL } $exon_bi_ref=$access.':'.$start.'..'.$access.':'.$end; #SPL $exon = uc($exon); #SPL if($complement){ #SPL $exon =~ tr/ACTG[A-Z]/TGACN/; ##complement #SPL $exon = reverse($exon); #SPL } # print "$exon $length\n"; if ($complement) {$complement=1} else {$complement=0} return($exon_bi_ref, $length,$complement); } sub get_cds_starting # $rna_exon_bi_ref,$rna_exon_complement, # $cds_exon_bi_ref,$cds_exon_complement; {my $c_ref_rna=''; my $c_ref_cds=''; my ($c_rna_exon_start,$c_rna_exon_end); my ($c_cds_exon_start,$c_cds_exon_end); my ($aux1,$aux2); my $ret_flg=0; $_[0]=~/\.\./; $aux1=$`;$aux2=$'; ($c_ref_rna,$c_rna_exon_start)=split /\:/,$aux1; ($c_ref_rna,$c_rna_exon_end)= split /\:/,$aux2; $_[2]=~/\.\./; $aux1=$`;$aux2=$'; ($c_ref_cds,$c_cds_exon_start)=split /\:/,$aux1; ($c_ref_cds,$c_cds_exon_end)=split /\:/,$aux2; # print "$c_ref_rna $c_rna_exon_end r_dir:$_[1] c_dir:$_[3]\n"; # &press_any_key; if ($c_ref_rna eq $c_ref_cds) { if (($_[1]==$_[3])&&($_[1]==0)) {if ($c_rna_exon_end==$c_cds_exon_end) {$ret_flg=1}} if (($_[1]==$_[3])&&($_[1]==1)) {if ($c_rna_exon_start==$c_cds_exon_start) {$ret_flg=1}} } #end if ref equ return $ret_flg } sub get_cds_ending # $rna_exon_bi_ref,$rna_exon_complement, # $cds_exon_bi_ref,$cds_exon_complement; # $c_cds_e, $#cds_raw_exons {my $c_ref_rna=''; my $c_ref_cds=''; my ($c_rna_exon_start,$c_rna_exon_end); my ($c_cds_exon_start,$c_cds_exon_end); my ($aux1,$aux2); my $ret_flg=0; $_[0]=~/\.\./; $aux1=$`;$aux2=$'; ($c_ref_rna,$c_rna_exon_start)=split /\:/,$aux1; ($c_ref_rna,$c_rna_exon_end)=split /\:/,$aux2; $_[2]=~/\.\./; $aux1=$`;$aux2=$'; ($c_ref_cds,$c_cds_exon_start)=split /\:/,$aux1; ($c_ref_cds,$c_cds_exon_end)=split /\:/,$aux2; if ($c_ref_rna eq $c_ref_cds) { if ($_[4]<=$_[5]) { if (($_[1]==$_[3])&&($_[1]==0)) { if (($c_rna_exon_start==$c_cds_exon_start) && ($c_rna_exon_end!=$c_cds_exon_end)) {$ret_flg=1} } if (($_[1]==$_[3])&&($_[1]==1)) { if (($c_rna_exon_end==$c_cds_exon_end) && ($c_rna_exon_start!=$c_cds_exon_start)) {$ret_flg=1} } } # end if ($c_cds_e<=$#cds_raw_exons) if ($_[4]==$_[5]) { #case when last cds exon equ rna exon if ($_[1]==$_[3]) { if (($c_rna_exon_start==$c_cds_exon_start) && ($c_rna_exon_end==$c_cds_exon_end)) {$ret_flg=1} } } } #end if ref equ return $ret_flg } sub get_cds { my $line; undef($codon_start); @entrylines = split(/\n/, $cds_info); my $i = 0; while($i <= $#entrylines){ if($entrylines[$i] =~ /CDS\s+join|CDS\s+complement\(join/){ #) $parenind = ($entrylines[$i] =~ tr/\(/\(/) - ($entrylines[$i] =~ tr/\)/\)/); $cdsline = $entrylines[$i]; while($parenind != 0){ $line = $entrylines[++$i]; $parenind += ($line =~ tr/\(/\(/); $parenind -= ($line =~ tr/\)/\)/); $cdsline .= $line; } #&print_cds_line($cdsline); #SPL #&press_any_key; #SPL while(++$i <= $#entrylines){ $line = $entrylines[$i]; if($line =~ /^ \/codon_start=(\d)/){ $codon_start = $1 - 1; $translation = "ZZZZ"; return 1; } elsif($line =~ /^ \/translation=/){ $translation = $line; $translation =~ s/\/translation=\"//; while($translation !~ /\"/){ $line = $entrylines[++$i]; $translation .= $line; } $translation =~ s/\s+//g; $translation =~ s/\"//; if(!defined($codon_start)){ die "No codon_start before translation:$locus\n$cdsline\n"; } return 1; } } return 0; } $i++; } return 0; } sub get_gene_mRNA_cds { #SPL new subroutine extracting gene name, mRNA record, CDS record my $line; my $op_res=($gene_info=~/\/(gene|locus_tag)=\"([\040-\176]+)\"/); #SPL # any printable symbol from ASCII if ($op_res) {$gene_name=$2} #EMBL_GB else {$op_res=($gene_info=~/\/(gene|locus_tag)=\s*([\041-\176]+)/); if ($op_res) {$gene_name=$2} else {$gene_name='';} } undef($codon_start); @entrylines = split(/\n/, $mRNA_info); my $i = 0; while($i <= $#entrylines){ if($entrylines[$i] =~ /^\s{5}mRNA\s{5}/){ $parenind = ($entrylines[$i] =~ tr/\(/\(/) - ($entrylines[$i] =~ tr/\)/\)/); $mRNAline = $entrylines[$i]; while($parenind != 0){ $line = $entrylines[++$i]; $parenind += ($line =~ tr/\(/\(/); $parenind -= ($line =~ tr/\)/\)/); $mRNAline .= $line; } } $i++; } substr($mRNAline,21)=~s/[\s\n]//g; $mRNAline=substr($mRNAline,0,21).&expand_rec_1nuc2(substr($mRNAline,21)); # &print_cds_line($mRNAline);&press_any_key; if ($mRNAline=~/[<>]/) {$mRNA_incomplete_flg=1} else {$mRNA_incomplete_flg=0} @entrylines = split(/\n/, $cds_info); $i = 0; while($i <= $#entrylines){ if($entrylines[$i] =~ /CDS\s+join|CDS\s+complement\(join/){ #) $parenind = ($entrylines[$i] =~ tr/\(/\(/) - ($entrylines[$i] =~ tr/\)/\)/); $cdsline = $entrylines[$i]; while($parenind != 0){ $line = $entrylines[++$i]; $parenind += ($line =~ tr/\(/\(/); $parenind -= ($line =~ tr/\)/\)/); $cdsline .= $line; } # &print_cds_line($cdsline); #SPL substr($cdsline,21)=~s/[\s\n]//g; $cdsline=substr($cdsline,0,21).&expand_rec_1nuc2(substr($cdsline,21)); #&print_cds_line($cdsline); #SPL #&press_any_key; #SPL if ($cdsline=~/[<>]/) {$CDS_incomplete_flg=1} else {$CDS_incomplete_flg=0} while(++$i <= $#entrylines){ $line = $entrylines[$i]; if($line =~ /^ \/codon_start=(\d)/){ $codon_start = $1 - 1; $translation = "ZZZZ"; return 1; } #SPL ORI elsif($line =~ /^ \/translation=/){ #SPL ORI $translation = $line; #SPL ORI $translation =~ s/\/translation=\"//; #SPL ORI while($translation !~ /\"/){ #SPL ORI $line = $entrylines[++$i]; #SPL ORI $translation .= $line; #SPL ORI } #SPL ORI $translation =~ s/\s+//g; #SPL ORI $translation =~ s/\"//; #SPL ORI if(!defined($codon_start)){ #SPL ORI die "No codon_start before translation:$locus\n$cdsline\n"; #SPL ORI } #SPL ORI return 1; #SPL ORI #SPL ORI } } return 0; } $i++; } return 0; } sub get_feature_start_end #($short_feature); return (start, end, err_code) #SPL new subroutine: extracting feature start and end #SPL no cross-reference allowed (no check) #SPL $_[1]=2: do not swap $c_start,$c_end {my $aux_short_feature=$_[0]; my $dir=0; $dir=($aux_short_feature =~/\s+complement\(/); #ORI #SPL EMBL-GB $dir=($aux_short_feature =~/\s*complement\(/); if ($dir!=1) {$dir=0} # print "aux_short_feature>>>\n"; # print "$aux_short_feature"; # print "<<]?\d+)/$1/g); my $i=0; while (my $c_res=($aux_short_feature=~/(\d+)\.\.[<>]?(\d+)/g)) {$i++; if ($i==1) {$c_start=$1} if ($i==$item_num) {$c_end=$2} # print "$i: $1 $2\n"; } # print "start=$c_start end=$c_end\n"; my $err_code=0; if (($c_start<1) || ($c_end<$c_start)) {$err_code=1;} if (($dir==1) && ($_[1]!=2)) { my $aux=$c_start;$c_start=$c_end;$c_end=$aux} if ($c_res=($aux_short_feature=~/\:/)) {$err_code=2} return ($c_start,$c_end, $dir,$err_code) } sub get_feature_start_end2 #($short_feature); return (start, end, err_code) #SPL new subroutine: extracting feature start and end #SPL no cross-reference allowed (no check) #SPL assuming spaces and new lines signes already deleted {my $aux_short_feature=$_[0]; my $c_start=0; my $c_end=0; my $item_num=($aux_short_feature =~s/(\d+\.\.[<>]?\d+)/$1/g); # print "feature>>>\n"; # print "$aux_short_feature"; # print "<<]?(\d+)/g)) {$i++; if ($i==1) {$c_start=$1} if ($i==$item_num) {$c_end=$2} } my $err_code=0; if (($c_start<1) || ($c_end<$c_start)) {$err_code=1;} return ($c_start,$c_end, $err_code) } # --------- sub init_gene_code { $codon_tab ='TTT#TTC#TTA#TTG#TCT#TCC#TCA#TCG#TAT#TAC#TAA#TAG#TGT#TGC#TGA#TGG#'; $codon_tab.='CTT#CTC#CTA#CTG#CCT#CCC#CCA#CCG#CAT#CAC#CAA#CAG#CGT#CGC#CGA#CGG#'; $codon_tab.='ATT#ATC#ATA#ATG#ACT#ACC#ACA#ACG#AAT#AAC#AAA#AAG#AGT#AGC#AGA#AGG#'; $codon_tab.='GTT#GTC#GTA#GTG#GCT#GCC#GCA#GCG#GAT#GAC#GAA#GAG#GGT#GGC#GGA#GGG#'; $gene_code ='FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'; # # # # } # my $c_pept=''; # my $aux_code=&translate1(substr($CDS_sq,$codon_start-1)); # if ($aux_code!=0) {$err_code=11} #err code # $aux_code=&init_codon(substr($CDS_sq,$codon_start-1,3)); # if ($aux_code!=0) {$warn_code.='1'} else {$warn_code.='0'} # $aux_code=&check_trunc_translate($c_pept); # print 'tran stop=',$aux_code,$eol_str; # if ($err_code==0) # {#print 'pept_len=',length($c_pept),' pept=',$c_pept,$eol_str; #-------------- sub translate1 #(c_cds); return $tran_err {my $tran_err=0; my $c_ind; my $c_pres; my $c_cds=$_[0]; # print $c_cds,"\n"; $translation=''; for (my $i=0; $i=0) {$c_ind=$c_ind /4; $c_pres=substr($gene_code,$c_ind,1); } else {$tran_err=1; $c_pres='X'; &trace("\$i=$i ".substr($c_cds,$i,3),4);} $translation.=$c_pres; } return $tran_err; } #-------------- sub check_trunc_translate # return $check_pept {my $check_err=0; my $c_ind; if (substr($translation,length($translation)-1,1) eq '*') {chop $translation} $check_err=($translation=~tr/*//); return $check_err; } #-------------- sub init_codon #(c_cds); return $c_pept {my $init_err=0; my $c_cds=$_[0]; # print $c_cds,' ',substr($c_cds,0,3),"\n"; if (substr($c_cds,0,3) ne 'ATG') {$init_err=1} return $init_err; } #-------------- sub expand_rec_1nuc #(c_rec) { my $c_rec=$_[0]; my $c_res=($c_rec=~s/([^\0560-9]{1})(\d+),/$1$2\.\.$2,/g); $c_rec=~s/,(\d+)([^\0560-9]{1})/,$1\.\.$1$2/g; #\056=period (point) # ignore the case of single nucleotide only return $c_rec; } #-------------- sub expand_rec_1nuc2 #(c_rec) # example: join(1..20,55) -> join(1..20,55..55) { my $c_rec=$_[0]; my $c_res=($c_rec=~s/([^\0560-9<>]{1})(\d+),/$1$2\.\.$2,/g); $c_rec=~s/,(\d+)([^\0560-9]{1})/,$1\.\.$1$2/g; #\056=period (point) # ignore the case of single nucleotide only $c_res=($c_rec=~s/([^\0560-9<>]{1})(\d+)\)/$1$2\.\.$2\)/g); # $c_rec=~s/,(\d+)([^\0560-9]{1})/,$1\.\.$1$2/g; #\056=period (point) return $c_rec; } #-------------- sub trace #(c_message, line_indent) {my $auxs; if ($_[1]>=0) {$auxs=' 'x$_[1];} print TRACEF $auxs,$_[0],"\n"; } #-------------- sub trace_stat #(c_message, line_indent) {my $auxs; if ($_[1]>=0) {$auxs=' 'x$_[1];} print STATF $auxs,$_[0],"\n"; } #-------------- sub handle_last_gene_info # push all about last gene in EID { if ($all_CDS_with_stop_codons_flg==1) { $number_of_gene_with_stop_codons++; } #end if all CDS with stop codons if ($some_CDS_with_invalid_codons_flg==1) { $number_of_gene_with_invalid_codons++; } # end if some CDS with invalid codon $total_intron_num_gene1=$total_intron_num_gene1+$c_in_gene_intron_c_max; #a20 $total_exon_num_gene1=$total_exon_num_gene1+$c_in_gene_exon_c_max; #a20 $total_nonc_intron_num_gene1=$total_nonc_intron_num_gene1+$c_in_gene_nonc_intron_c_max; #a20 $total_nonc_atac_intron_num_gene1=$total_nonc_atac_intron_num_gene1+$c_in_gene_nonc_atac_intron_c_max; #a20 $total_eshort_intron_num_gene1+=$c_in_gene_eshort_intron_c_max; #a21 $total_undef_intron_num_gene1+=$c_in_gene_undef_intron_c_max; #a21 $total_elong_intron_num_gene1+=$c_in_gene_elong_intron_c_max; #a21 } #-------------- small lib ---- #-------------- sub print_cds_line #(c_line) {my $c_line=$_[0]; for (my $j=0;$j<=(length($c_line));$j++) {if (ord(substr($c_line,$j,1))!=13) {print substr($c_line,$j,1)}} } #-------------- sub chop_if_chr #(string;chr) {my $loc_string=$_[0]; #print 'char=',ord($_[1]),' string=',$loc_string,' substr=',ord(substr ($loc_string,-1,1)); if (substr ($loc_string,-1,1) eq $_[1]) {chop($loc_string);} #print 'after last=',ord(substr ($loc_string,-1,1)); return $loc_string; } #-------------- sub del_trail_spaces #($loc_l) {$_=$_[0]; s/ +$//;$_=$_} #deleting trailing spaces #-------------- sub purge_eol #($loc_l) {my $loc_l=$_[0]; chomp($loc_l); $loc_l=&chop_if_chr($loc_l,chr(13)); #print 'chopic: ',$loc_l,' ',length($loc_l); } #-------------- sub purge_eol_sp #($loc_l) {my $loc_l=$_[0]; chomp($loc_l); $loc_l=&chop_if_chr($loc_l,chr(13)); $loc_l=&del_trail_spaces($loc_l); } #-------------- sub press_any_key { read STDIN,my $a_key,1; $a_key='';} #---- end of lib ----- #----------------------------------------------------------- #SPL REVISION HISTORY #SPL ver 1.06 12.10.04 first working ver with new output form #SPL ver 1.07 13.10.04 tracing data #SPL ver 1.08 20.10.04 new CDS numbering system, tracing #SPL ver 1.09 21.10.04 intron count #SPL ver 1.10 22.10.04 minor statistics, minor debug #SPL ver 1.11 01.11.04 handle CDS exon of len=1 #SPL ver 1.12 08.11.04 minor modifications #SPL ver 1.13 11.11.04 intron count debug # last p_exon modified # entries with stop codons discarded # remove 0A from the first line of dEID #SPL ver 1.14 15.11.04 entries with stop codons marked by STOP_CODON # special counts for introns #SPL ver 1.15 21.11.04 debug get_gene_mRNA_cds # search mRNA key #SPL ver 1.16 26.11.04 reading gziped SQ file b35 like hs_ref_chrx.gbk.gz #SPL ver 1.17 28.11.04 #SPL ver 1.18 02.12.04 flag UTR_AMBI #SPL ver 1.19 11.12.04 flag UTR_NF; debug translate extra nucleotides (mod3) #SPL ver 1.20 17.12.04 more statistics to no protein_id #SPL ver 1.21 23.12.04 minor changes (mainly > sign in last figure) #SPL ver 1.22 23.12.04 <> to CDS incomplete, no protein_id+ #SPL ver 1.23 15.01.05 out trace old version tran error statistics #SPL ver 1.24 20.08.05 statistics file inroduced #SPL ver 1.25 15.09.05 PSEUDO #SPL ver 1.26 18.09.05 "locus_tag=" handling #SPL ver 1.27 29.10.05 GB149 cross-reference handle #SPL ver 1.28 07.11.05 working, undefined intron (..) excluded # from short intron statistics, # PSEUDO tag to pEID file, # CDS_incomplete tag not on separate line but # in first line #SPL ver 1.29 09.11.05 small correction #SPL ver 1.30 09.11.05 cross-ref to unexistent GB record #SPL ver 1.31 23.11.05 corrected get_feature_start_end # return err_code=2 if cross-reference #SPL ver 1.32 29.12.05 corrected make_cds_entry_GB149, mark dec2912, # probably no influence #SPL ver 1.33 02.01.06 expand_rec_1nuc2 #SPL ver 1.34 08.04.07 <> to mRNA incomplete, mRNA_incomplete tag #SPL ver 1.35 26.07.07 damping invalid format in EMBL formatted to GenBank # in line /gene=gene_name # in gene record, that was made without double quotation # which leads to empty gene_name # mark is EMBL_GB # Uppercase nucleotide converted to lowrcase # otherwise improper function, only tcagn allowed, u->t #--------------------------------------------------------------------------