#!/usr/bin/perl -w # Serge Saxonov, Iraj Daizadeh, Alexei Fedorov and Walter Gilbert # Department of Molecular and Cellular Biology # Harvard University # Cambridge, MA 02138 # Reference: Serge Saxonov, Iraj Daizadeh, Alexei Fedorov, and Walter Gilbert # Nucleic Acids Res. 2000 Jan 1;28(1):185-190 #SPL spl notes #SPL spl reduction in Sept 2004 #SPL ver 1.03 20.10.04 #SPL ver 1.04 21.10.04 #SPL ver 1.05 22.10.04 #SPL ver 1.06 01.11.04 #SPL ver 1.07 07.11.04 #SPL ver 1.08 12.11.04 #SPL ver 1.09 17.11.04 #SPL ver 1.10 19.11.04 #SPL ver 1.11 19.11.04 #SPL ver 1.12 02.12.04 #SPL ver 1.13 11.12.04 #SPL ver 1.14 16.12.04 #SPL ver 1.15 17.12.04 #SPL ver 1.16 17.12.04 #SPL ver 1.17 15.01.05 #SPL ver 1.18 19.08.05 #SPL ver 1.19 04.09.05 #SPL ver 1.20 11.09.05 #SPL ver 1.21 13.09.05 #SPL ver 1.22 15.09.05 #SPL ver 1.23 18.09.05 #SPL ver 1.24 25.10.05 #SPL ver 1.25 01.11.05 #SPL ver 1.26 08.11.05 #SPL ver 1.27 18.11.05 #SPL ver 1.28 25.12.05 #SPL ver 1.29 26.12.05 #SPL ver 1.30 28.12.05 #SPL ver 1.31 01.01.06 #SPL ver 1.32 02.01.06 #SPL ver 1.33 05.01.06 #SPL ver 1.34 29.06.06 #SPL ver 1.35 11.07.06 #SPL ver 1.36 19.07.06 #SPL ver 1.37 23.08.06 #SPL ver 1.37e 01.09.06 #SPL ver 1.38e 01.09.06 #SPL ver 1.39e 01.09.06 #SPL ver 1.40e 13.09.06 #SPL ver 1.41e 21.09.06 #SPL ver 1.42e 21.09.06 #SPL ver 1.43e 21.09.06 #SPL ver 1.44e 21.09.06 #SPL ver 1.45e 21.09.06 #SPL ver 1.46e 07.10.06 #SPL ver 1.47e 18.10.06 #SPL ver 1.48e 19.10.06 #SPL ver 1.49e 20.10.06 #SPL ver 1.50e 20.10.06 #SPL ver 1.51e 20.10.06 #SPL ver 1.52e 20.10.06 #SPL ver 1.53e 08.04.07 #SPL ver 1.54e 18.07.07 #SPL ver 1.55e 26.07.07 #SPL ver 1.55ee 05.08.07 ## check_translations.pl -- The program to parse the files produced by first_parse.pl ## check that the translations are present and correspond in ## length to that specified in the CDS join feature. For each ## CDS join, it outputs the information about the locus and the ## CDS join in a separate entry in $prefix.RAWchecked. STDIN ## is used for input. my $c_ver='1.56e'; my $txt_file_style=1;# =0 DOS-like style; =1 UNIX-like style #SPL my $eol_str; #SPL my @gene_blocks; my @CDS_blocks; my $coding_gene_flg=0; #a19 my $intronless_c=0; #SPL counter of intronless CDS; if ($txt_file_style==0) {$eol_str="\r\n";} else {$eol_str="\n";} #SPL #&test_expand_1nucl; #&test_diff_num; #exit; exit; ($#ARGV != -1) || die "Usage: check_translations.pl prefix\n"; $prefix = $ARGV[0]; $modeCDS= $ARGV[1]; open(OUT, ">$prefix.RAC") || die;#SPL ORI open(OUT, ">$prefix.RAWchecked") || die; open(INP_F,"$prefix.RAW") || die "unable to open $prefix.RAW"; #SPL open(TRACEF, ">>${prefix}.tEID") || die "unable open ${prefix}.tEID"; open(STATF, ">>${prefix}.sEID") || die "unable open ${prefix}.sEID"; #a19 &trace("modeCDS=$modeCDS",0); #SPL $real_cds_counter = 0; $cds_counter = 0; $cds_feature_counter = 0; $gene_counter = 0; #SPL new counter due to new spec $coding_gene_counter=0; #a19 $OK_gene_counter = 0; #SPL new counter due to new spec, passed verification $non_alt_gene_counter=0; #SPL counter of genes without alternative splicing; $alt_gene_counter=0; #SPL counter of genes with alternative splicing; $alt_isoforms_c=0; #a19 total number of alternatively spliced isoforms $mRNA_match_controvers_c=0; #SPL counter of contraversal mRNA assignment events $skipped_cds = 0; $mRNA_assign_err_c=0; #number of CDS with no mRNA assignment $CDS_format_error_c=0; $null_CDS_gene_counter=0; $mRNA_match_controvers_gene_c=0; $err_get_mRNA_CDS_c=0; $aux_c1=0; $aux_c3=0; $aux_c4=0; my $aux_gene_name='???'; my $gene_counter_in_contig=0; my $locus_overlap_c=0; my $locus_overlap_c_in_genes=0; my $locus_overlapped_c=0; #a19 numer of genes paticipating in overlap my $CDS_non_overlap_event_c=0; my $CDS_non_overlap_c=0; my $CDS_gene_name_mismatch_c=0; my $CDS_gene_name_mismatch_locus_c=0; my $c_CDS=0; my $in_gene_CDS_gene_name_mismatch_c=0; my $dismissed_alt_gene_c=0; my $total_improvements_in_second_round=0; my $final_improvements_in_second_round=0; my $number_fails_in_second_round=0; my $alt_mRNA_while_CDS_intronless_c=0; my $spliced_mRNA_while_CDS_intronless_c1=0; my $all_mRNA_equ_c=0; my $prot_id=''; my $no_protein_id_check_phase=0; my $CDS_exon_num_err_c=0; # number of errors in calculating CDS_exon_num my $CDS_exon_num_equ1_err_c=0; # same when erroneous value is 1 my $aux_CDS_exon_num_diff_c=0; my $CDS_exon_num_equ21_err_defpid_c=0; my $CDS_exon_num_equ21_err_nopid_c=0; my $CDS_exon_num_err_defpid_c=0; my $CDS_exon_num_err_nopid_c=0; my $corr_intronless_c=0; my $spliced_mRNA_while_CDS_intronless_defpid_c1=0; my $corr_intronless_defpid_c=0; my $CDS_exon_num_equ0_c=0; my $spliced_mRNA_avail_flg=0; #a19 only declaration my $deb_c=0; my $CDS_pseudo_c=0; my $corr_intronless_CDS_pseudo_c=0; my $corr_spliced_CDS_pseudo_c=0; my $corr_intronless_CDS_pseudo_defpid_c=0; my $corr_spliced_CDS_pseudo_defpid_c=0; my $pseudo_in_alt_gene_c=0; my $pseudo_flg=0; my $rejected_mRNA_recs_due_to_err=0; # declaration of alternative exons counts part my $total_alt_mRNA_gene_c=0; my $total_OK_mRNA_in_alt_mRNA_gene_c=0; my $total_irremovable_exons_c=0; my $total_alternative_exons_c=0; my $total_diff_alt_c=0; my $total_skipped_exon_c=0; my $total_skipped_internal=0; my $total_trunc_class_num=0; my $total_others_num=0; my $total_retained_intron_c=0; my $total_retained_intron_in_trunc_c=0; my $total_retained_intron_others_c=0; my $total_retained_intron_others_cc=0; $/ = ""; while($cur_entry = ){ ($id) = ($cur_entry =~ /\nLOCUS\s+(\S+)/); #$deb_c++; #print 'deb_c=',$deb_c,$eol_str;&press_any_key; #print $cur_entry;&press_any_key; #print 'id:>>>',$id,'<<<';&press_any_key; (defined($id)) || die "Couldn't resolve the LOCUS line in $cur_entry\n"; &do_checks2($id, $cur_entry); } &trace("total number of genes=$gene_counter",0); #SPL &trace("total number of OK genes=$OK_gene_counter",0); &trace("total number of genes without alt splicing=$non_alt_gene_counter",0); &trace("total number of genes with alt splicing=$alt_gene_counter",0); if ($dismissed_alt_gene_c>=0) { &trace("total number of locus overlap events =$locus_overlap_c",0); &trace("total number of locus overlapped paires =$locus_overlap_c_in_genes",0); &trace("total number of nonoverlapped CDS events =$CDS_non_overlap_event_c",0); &trace("total number of locuses with nonoverlapped CDS=$CDS_non_overlap_c",0); &trace("total number of events of CDS gene name mismatch=$CDS_gene_name_mismatch_c",0); &trace("total number of locuses with CDS gene name mismatch=$CDS_gene_name_mismatch_locus_c",0); &trace("total number of dismissed alt splice gene name counter=$dismissed_alt_gene_c",0); my $aux_cc=$alt_gene_counter-$dismissed_alt_gene_c; &trace("corrected total number of genes with alt splicing=$aux_cc",0); } #SPL if ($modeCDS ne 'nowi') {&trace("total number of controversial mRNA assignment=$mRNA_match_controvers_c",0); #SPL &trace("total number of genes with controversial mRNA assignment=$mRNA_match_controvers_gene_c",0); #SPL &trace("total_improvements_in_second_round=$total_improvements_in_second_round",0);#SPL &trace("final_improvements_in_second_round=$final_improvements_in_second_round",0);#SPL &trace("number of fails in second round=$number_fails_in_second_round",0);#SPL &trace("number of events when all controversial mRNA assign equ=$all_mRNA_equ_c",0); } &trace("total number of CDS features=$cds_feature_counter",0); #SPL &trace("total number of CDS to store=$cds_counter",0); #SPL #no further &trace("number of intronless CDS=$intronless_c",0); #SPL &trace("corr_intronless_defpid_c=$corr_intronless_defpid_c",0); &trace("number of spliced mRNA for intronless CDS=$alt_mRNA_while_CDS_intronless_c",0);#SPL &trace("number of intronless CDS while spliced mRNA available=$spliced_mRNA_while_CDS_intronless_c1",0); &trace("spliced_mRNA_while_CDS_intronless_defpid_c1=$spliced_mRNA_while_CDS_intronless_defpid_c1",0); &trace("number of mRNA assignment errors=$mRNA_assign_err_c",0); #SPL &trace("CDS format errors=$CDS_format_error_c",0); #SPL &trace("no_protein_id_total=$no_protein_id_check_phase",0); &trace("CDS_exon_num_err_c=$CDS_exon_num_err_c",0); &trace("CDS_exon_num_equ1_err_c=$CDS_exon_num_equ1_err_c",0); &trace("aux_CDS_exon_num_diff_c=$aux_CDS_exon_num_diff_c",0); &trace("CDS_exon_num_equ21_err_defpid_c=$CDS_exon_num_equ21_err_defpid_c",0); &trace("CDS_exon_num_equ21_err_nopid_c=$CDS_exon_num_equ21_err_nopid_c",0); &trace("CDS_exon_num_err_defpid_c=$CDS_exon_num_err_defpid_c",0); &trace("CDS_exon_num_err_nopid_c=$CDS_exon_num_err_nopid_c",0); &trace("corr_intronless_c=$corr_intronless_c",0); &trace("CDS_exon_num_equ0_c=$CDS_exon_num_equ0_c",0); &trace("null_CDS_gene_counter=$null_CDS_gene_counter",4); &trace("err_get_mRNA_CDS_c=$err_get_mRNA_CDS_c",4); &trace("number of CDS with two or more exons: ".$aux_c1,4); &trace("aux err assignment: ".$aux_c3,4); &trace("!!! err : ".$aux_c4,4); &trace("CDS with pseudo tag: ".$CDS_pseudo_c,0); &trace("spliced CDS with pseudo tag:".$corr_spliced_CDS_pseudo_c,0); &trace("intronless_CDS with pseudo tag:".$corr_intronless_CDS_pseudo_c,0); &trace("spliced CDS with pseudo tag defined protein_id:".$corr_spliced_CDS_pseudo_defpid_c,0); &trace("intronless_CDS with pseudo tag defined protein_id:".$corr_intronless_CDS_pseudo_defpid_c,0); &trace("pseudo gene in alternatively spliced gene:".$pseudo_in_alt_gene_c,0); &trace("rejected mRNA records due to format errors:".$rejected_mRNA_recs_due_to_err,0); &trace(" --- alternative mRNA exons count ---",0); &trace("total number of genes with alternative mRNA................... ".$total_alt_mRNA_gene_c,0); &trace("total number of correct mRNA in genes with alternative mRNA... ".$total_OK_mRNA_in_alt_mRNA_gene_c,0); &trace("total number of constitutive exons............................ ".$total_irremovable_exons_c,0); &trace("total number of alternative exons in all mRNAs................ ".$total_alternative_exons_c,0); &trace("total number of different alternative exons................... ".$total_diff_alt_c,0); &trace("total number of skipped exons................................. ".$total_skipped_exon_c,0); &trace("total number of skipped-internal exons........................ ".$total_skipped_internal,0); &trace("total number of islands with truncated exons.................. ".$total_trunc_class_num,0); &trace("total number of other islands................................. ".$total_others_num,0); &trace("total number of islands with retained introns................. ".$total_retained_intron_c,0); &trace("total number of islands with retained introns and truncated",0); &trace(" exons......................................... ".$total_retained_intron_in_trunc_c,0); &trace("total number of other islands with retained introns........... ".$total_retained_intron_others_c,0); &trace("total number of retained introns other islands................ ".$total_retained_intron_others_cc,0); &trace("version $c_ver",0); close TRACEF; #SPL # STAT file output #a19 &trace_stat("STATISTICS OF GENOMIC EID",10); #a19 &trace_stat("",0); #a19 &trace_stat("A.General:",0); #a19 &trace_stat("Total number of Gene blocks in GenBank........................ $gene_counter",0); #a19 &trace_stat("Total number of protein coding genes.......................... $coding_gene_counter",0); #a19 &trace_stat("Total number of protein coding genes having introns within",0); #a19 &trace_stat(" CDS region.............................................. $OK_gene_counter",0); #a19 &trace_stat("Total number of genes without alt splicing.................... $non_alt_gene_counter",0); #SPL &trace_stat("Total number of genes with alternative splicing............... $alt_gene_counter",0); #a19 &trace_stat("Total number of alternatively spliced isoforms................ $alt_isoforms_c",0); #a19 &trace_stat("Total number of overlapped protein coding genes in EID........ $locus_overlapped_c",0); #a19 #&trace("Total number of intronless protein-coding genes =$locus_overlap_c_in_genes",0); #SPL &trace_stat(" --- alternative mRNA exons count ---",0); &trace_stat("total number of genes with alternative mRNA................... ".$total_alt_mRNA_gene_c,0); &trace_stat("total number of correct mRNA in genes with alternative mRNA... ".$total_OK_mRNA_in_alt_mRNA_gene_c,0); &trace_stat("total number of constitutive exons............................ ".$total_irremovable_exons_c,0); &trace_stat("total number of alternative exons in all mRNAs................ ".$total_alternative_exons_c,0); &trace_stat("total number of different alternative exons................... ".$total_diff_alt_c,0); &trace_stat("total number of skipped exons................................. ".$total_skipped_exon_c,0); &trace_stat("total number of skipped-internal exons........................ ".$total_skipped_internal,0); &trace_stat("total number of islands with truncated exons.................. ".$total_trunc_class_num,0); &trace_stat("total number of other islands................................. ".$total_others_num,0); &trace_stat("total number of islands with retained introns................. ".$total_retained_intron_c,0); &trace_stat("total number of islands with retained introns and truncated",0); &trace_stat(" exons......................................... ".$total_retained_intron_in_trunc_c,0); &trace_stat("total number of other islands with retained introns........... ".$total_retained_intron_others_c,0); &trace_stat("total number of retained introns other islands................ ".$total_retained_intron_others_cc,0); close STATF; #a19 #print 'before exit',$eol_str; &press_any_key; #&press_any_key; exit; #SPL sub do_checks{ my $output = ""; my @genes; my $header; @entrylines = split(/\n/, $cur_entry); ### get the header info for($k = 1; ;$k++){ $header .= "$entrylines[$k]\n"; if($entrylines[$k] =~ /^ source /){ last; } } while(1){ $k++; if($entrylines[$k] !~ /^ /){ last; } else{ $header .= "$entrylines[$k]\n"; } } ### finished getting the header info #SPL entire header including only first source field $i = 2; ## this will keep track of where in @entrylines we currently are CDS:while(&get_cds){ #&print_cds_line; #SPL #print "$cdsline"; #SPL #print "$cds_info"; #SPL $output = &check_cds($cds_info); if($output){ push(@genes, $output); } } foreach $gene (@genes){ ++$cds_counter; print OUT "\$ID $cds_counter\n"; print OUT "${header}_HEADER_END_"; print OUT "$gene\n\n"; } } sub do_checks2{ #SPL new procedure making RAWchecked file #SPL consisting the block of gene, all mRNA of the gene, #SPL and one CDS of that gene #SPL no translation, no mutual mRNA and CDS check #SPL since no sequence availble now due to program chart my $output = ""; my @genes; my $header=''; #s11 my $cds_c_in_gene=0; #SPL numbering of the CDS in gene my $CDS_c_str=''; #SPL letter numbering of the CDS in gene my $err_code=0; my $c_locus_flg=0; @entrylines = split(/\n/, $cur_entry); ### get the header info &trace($entrylines[0],0); &trace($entrylines[1],0); # print $entrylines[0], $eol_str; &press_any_key; # print $entrylines[1], $eol_str; &press_any_key; # print $#entrylines, $eol_str; &press_any_key; for($k = 1; $k<=$#entrylines;$k++){ # print 'k=',$k, $eol_str; &press_any_key; $header .= "$entrylines[$k]\n"; if($entrylines[$k] =~ /^ source /){ last; } } while($k <= $#entrylines){ $k++; if($entrylines[$k] !~ /^ /){ last; } else{ $header .= "$entrylines[$k]\n"; } } ### finished getting the header info #SPL entire header including only first source field $i = 2; ## this will keep track of where in @entrylines we currently are @gene_blocks=(); &get_gene_block; #&print_cds_line; #SPL # &print_cds_line($gene_block); #SPL # &press_any_key; #SPL # print $header,"\n"; # print 'gene_block size=',$#gene_blocks; #&press_any_key;#SPL # @gene_blocks are going separately for each contig; # &press_any_key; $gene_counter_in_contig=0; $old_gene_num=0; $old_gene_num3=0; #for CDS comparison in single gene in ..CDS02 foreach $gene_block (@gene_blocks){ # print $gene_block,"\n"; &press_any_key; $c_locus_flg=0; $in_gene_CDS_gene_name_mismatch_c=0; @CDS_blocks=(); my $op_res=($gene_block=~/\/(gene|locus_tag)=\"([\040-\176]+)\"/); if ($op_res) {$aux_gene_name=$2} #EMBL_GB else {$op_res=($gene_block=~/\/(gene|locus_tag)=\s*([\041-\176]+)/); if ($op_res) {$aux_gene_name=$2} } $coding_gene_flg=0; #a19 $pseudo_flg=0; #s15 if ($modeCDS eq 'nowi') {&get_mRNA_CDS0} # else {&get_mRNA_CDS} elsif ($modeCDS eq 'gb149') {&get_mRNA_CDS_GB149} else {&get_mRNA_CDS_GB149} # print ">>>\n"; # print $gene_block,"\n"; # print "<<<"; # &press_any_key; $cds_c_in_gene=0; $CDS_c_str=''; $gene_counter++; $gene_counter_in_contig++; if ($coding_gene_flg==1) {$coding_gene_counter++} #a19 if ($#CDS_blocks>=0) { $OK_gene_counter++; # print ">>>>$#CDS_blocks<<<<\n"; &press_any_key;#SPL debug if ($#CDS_blocks==0) {$non_alt_gene_counter++} elsif ($#CDS_blocks>0){$alt_gene_counter++; #SPL now count only CDS which has corresponding mRNA $alt_isoforms_c=$alt_isoforms_c+$#CDS_blocks+1; #a19 if ($pseudo_flg==1) {$pseudo_in_alt_gene_c++} #s15 } else {$null_CDS_gene_counter++} } else {$err_get_mRNA_CDS_c++} if ($#CDS_blocks>=0) { $c_CDS=0; foreach $CDS_block (@CDS_blocks){ $c_CDS++; # if ($aux_gene_name eq 'GAGE1') # {print $CDS_block,"\n"; &press_any_key;} ++$cds_counter; $CDS_c_str=$OK_gene_counter; #previous $gene_counter; if ($#CDS_blocks>0) {$CDS_c_str.=&get_CDS_c_str(++$cds_c_in_gene); # print $#CDS_blocks,' ',$cds_c_in_gene," $CDS_c_str\n"; } # check CDS over contig , check CDS gene names; can be encapsulated $dismissed_alt_gene_c=-1; #dummy param to out some statistics out_for_GB149_251205 #out_for_GB149_251205 if ($modeCDS ne 'gb149') {&scan_entire_contig_vs_CDS;} #deb 16.11.04 #out_for_GB149_251205 if ($modeCDS ne 'gb149') {$c_locus_flg=&check_CDS_gene_names($c_locus_flg); } print OUT "\$ID $CDS_c_str\n"; # print OUT "\$ID $cds_counter\n"; print OUT "${header}_HEADER_END_\n"; print OUT "$CDS_block\n\n"; } if ($#CDS_blocks>0) { if (($#CDS_blocks-$in_gene_CDS_gene_name_mismatch_c) <=0) {$dismissed_alt_gene_c++; &trace("dismissed alt splice gene $aux_gene_name",4); } } } #end if } } #sub convert_CDS_block_EMBLGB_GB #($CDS_block) # {my $c_E_mRNA_rec=''; # my $c_E_CDS_rec=''; # my $c_E CDS_block=@_[0]; # $c_E CDS_block=~/\015//g; # my @c_E_CDS_lines=split '\n',$c_E CDS_block; # # } sub get_cds{ my $line; undef($codon_start); while(++$i <= $#entrylines){ if($entrylines[$i] =~ /CDS\s+join|CDS\s+complement\(join/){ #) $real_cds_counter++; $cds_info = $entrylines[$i]; $parenind = ($entrylines[$i] =~ tr/\(/\(/) - ($entrylines[$i] =~ tr/\)/\)/); $cdsline = $entrylines[$i]; while($parenind != 0){ $line = $entrylines[++$i]; $cds_info .= "\n$line"; $parenind += ($line =~ tr/\(/\(/); $parenind -= ($line =~ tr/\)/\)/); $cdsline .= $line; } $codon_start = 0; while(++$i <= $#entrylines){ $line = $entrylines[$i]; #### ($line =~ /^\s*$/) && next; $cds_info .= "\n$line"; if($line =~ /^ \/codon_start=(\d)/){ $codon_start = $1 - 1; } elsif($line =~ /^ \/translation=/){ $translation = $line; $translation =~ s/\/translation=\"//; while($translation !~ /\"/){ $line = $entrylines[++$i]; $cds_info .= "\n$line"; $translation .= $line; } $translation =~ s/\s+//g; $translation =~ s/\"//; if(!defined($codon_start)){ die "No codon_start before translation:$id\n$cdsline\n"; } return 1; } elsif(($line !~ /^\s*$/) && ($line !~ /^ /)){ ## Some entries don't have translations. $cds_info =~ s/\n.*$//; # $codon_start = 0; ## So we just make up dummy values $translation = ""; ## for $translation and $codon_start. $i--; return 1; ## This will be caught later, when we ## check for length of $translation. } } $skipped_cds++; return 0; } } return 0; } sub get_gene_block{ #SPL new subroutine to extract block of lines belonging to one gene feature my $line; my $gene_flg=0; #SPL 1 = gene block available my $gene_line_flg=0; #SPL 1 = gene, mRNA, or CDS lines undef($codon_start); $gene_block=''; while(++$i <= $#entrylines){ if ($entrylines[$i] =~ /^\s{5}mRNA\s{1}|^\s{5}CDS\s{1}/){ $gene_line_flg=1; $gene_flg=1;} elsif ($entrylines[$i] =~ /^\s{5}gene\s{1}/){ $gene_line_flg=1; if ($gene_flg==1) {push @gene_blocks,$gene_block; $gene_block=''; } $gene_flg=1; } #SPL end if elsif ($entrylines[$i] =~ /^\s{5}\w+/) {$gene_line_flg=0} if (($gene_line_flg==1) && ($gene_flg==1)) {$gene_block.="$entrylines[$i]\n"} }#SPL end while if ($gene_flg==1) {push @gene_blocks,$gene_block} # print 'gene_flg=',$gene_flg,' in sub gene_blocks_size=',$#gene_blocks; # &press_any_key; return 0; } sub get_mRNA_CDS{ #SPL new subroutine to extract block of lines belonging to one CDS feature #SPL including gene feature and all mRNA feature my @temp_g_b=split("\n",$gene_block); #print '$#temp_g_b=',$#temp_g_b; #&press_any_key; my $cds_flg=0; #SPL 1 = cds block available my $gene_line_flg=0; #SPL 1 = gene, mRNA, or CDS lines my $CDS_line_flg=0; #SPL 1 = gene, mRNA, or CDS lines my $CDS_block=''; my $CDS_flg=0; my $mRNA_flg=0; my $mRNA_line_flg=0; my $gene_rec_end="_GENE_REC_END_\n"; my $mRNA_rec_end="_mRNA_REC_END_\n"; my $aux_str; my $i=0; my $mRNA_assing_controvers_flg=0; my $gene_block=''; $mRNA_block=''; # first gather gene feaure and all mRNA features while($i<= $#temp_g_b){ # print $temp_g_b[$i],"\n"; # &press_any_key; if ($temp_g_b[$i] =~ /^\s{5}gene\s{1}/){ $gene_line_flg=1} elsif ($temp_g_b[$i] =~ /^\s{5}mRNA\s{1}/) { $mRNA_line_flg=1;$gene_line_flg=0} elsif ($temp_g_b[$i] =~ /^\s{5}CDS\s{1}/) {$gene_line_flg=0; $mRNA_line_flg=0} if ($gene_line_flg==1) {$gene_block.="$temp_g_b[$i]\n"} if ($mRNA_line_flg==1) {$mRNA_block.="$temp_g_b[$i]\n"} $i++; }#SPL end while # second prepare long and short mRNA description stack my @temp_m_b=split("\n",$mRNA_block); # print 'check $#temp_m_b=',$#temp_m_b; # &press_any_key; my $mRNA_rec=''; @long_mRNA_stack=(); @mRNA_stack=(); $i=0; while($i<= $#temp_m_b){ # print $temp_m_b[$i],"\n"; # &press_any_key; if ($temp_m_b[$i] =~ /^\s{5}mRNA\s{1}/){ $mRNA_line_flg=1; if ($mRNA_flg==1) {push @long_mRNA_stack,$mRNA_rec; push @mRNA_stack,&purge_feature_record($mRNA_rec); } $mRNA_flg=1; $mRNA_rec=$temp_m_b[$i]."\n"; } else {if ($mRNA_line_flg==1) {$mRNA_rec.=$temp_m_b[$i]."\n"}} $i++; }#SPL end while if ($mRNA_flg==1) {push @long_mRNA_stack,$mRNA_rec; push @mRNA_stack,&purge_feature_record($mRNA_rec);} # third add CDS feature for every CDS feature $i=0;$CDS_flg=0;$CDS_line_flg=0; $c_CDS_num=0; my $CDS_exon_num=0; my $mRNA_num=0; my $mRNA_match_c=0; my $err_code=0; my $SPL_note_rec=' /SPL_note="mRNA assignment controversial"'; my $aux_rec=''; my $aux_RNA_block=''; while($i<= $#temp_g_b){ if ($temp_g_b[$i] =~ /^\s{5}CDS\s{1}/){ $CDS_line_flg=1; $cds_feature_counter++; $coding_gene_flg=1; #a19 if ($CDS_flg==1) { # print '>>',$CDS_block,'<<',$eol_str; &press_any_key;#s13 ($mRNA_num,$CDS_exon_num,$mRNA_match_c,$err_code)= &check_mRNA_block_CDS_block_rel($CDS_block); # print '$mRNA_num=',$mRNA_num,' $CDS_exon_num=',$CDS_exon_num, # ' $mRNA_match_c=',$mRNA_match_c,' $err_code=',$err_code,$eol_str; # &press_any_key; #s13 if ($err_code==0) { if ($CDS_exon_num>1) { $aux_c1++; if ($mRNA_num>0) { $j=0; foreach (@long_mRNA_stack) {$j++; if ($j==$mRNA_num) {if ($mRNA_match_c>1) {$aux_rec=$SPL_note_rec."\n"} else {$aux_rec="\n"} $aux_str=$gene_rec_end.$_.$aux_rec.$mRNA_rec_end; push @CDS_blocks,$gene_block.$aux_str.$CDS_block; last} } if ($mRNA_match_c>1) {&trace("$mRNA_match_c matched mRNA found",4); &trace($gene_block,0); $mRNA_match_controvers_c++; $mRNA_assing_controvers_flg=1; } } else {&trace("no matched mRNA found",4); $aux_RNA_block=$CDS_block; $aux_RNA_block=~s/ CDS / mRNA /; $aux_RNA_block.=' /SPLnote="DUMMY mRNA_NF"'."\n"; $aux_str=$gene_rec_end.$aux_RNA_block.$mRNA_rec_end; push @CDS_blocks,$gene_block.$aux_str.$CDS_block; &trace("mRNA considered equ CDS",4); &trace($gene_block,0); $mRNA_assign_err_c++; } } else {&trace("intronless CDS",4); &trace($gene_block,0) } } else {&trace("CDS format error",4); &trace($gene_block,0); $CDS_format_error_c++;} #a19 #print "$CDS_block\n"; #&press_any_key; $CDS_block=''; } $c_CDS_num++; $CDS_flg=1; } elsif ($temp_g_b[$i] =~ /^\s{5}\w+\s{1}/){ $CDS_line_flg=0} if ($CDS_line_flg==1) {$CDS_block.="$temp_g_b[$i]\n"} $i++; }#SPL end while if ($CDS_flg==1) { # print '>>',$CDS_block,'<<',$eol_str; &press_any_key;#s13 ($mRNA_num,$CDS_exon_num,$mRNA_match_c,$err_code)=&check_mRNA_block_CDS_block_rel($CDS_block); # print '$mRNA_num=',$mRNA_num,' $CDS_exon_num=',$CDS_exon_num, # ' $mRNA_match_c=',$mRNA_match_c,' $err_code=',$err_code,$eol_str; # &press_any_key; #s13 if ($err_code==0) { if ($CDS_exon_num>1) { $aux_c1++; if ($mRNA_num>0) { $j=0; foreach (@long_mRNA_stack) {$j++; if ($j==$mRNA_num) {if ($mRNA_match_c>1) {$aux_rec=$SPL_note_rec."\n"} else {$aux_rec="\n"} $aux_str=$gene_rec_end.$_.$aux_rec.$mRNA_rec_end; push @CDS_blocks,$gene_block.$aux_str.$CDS_block;last} } if ($mRNA_match_c>1) {&trace("$mRNA_match_c matched mRNA found",4); &trace($gene_block,0); $mRNA_match_controvers_c++; $mRNA_assing_controvers_flg=1; } } else {&trace("no matched mRNA found",4); $aux_RNA_block=$CDS_block; $aux_RNA_block=~s/ CDS / mRNA /; $aux_RNA_block.=' /SPLnote="DUMMY mRNA_NF"'."\n"; $aux_str=$gene_rec_end.$aux_RNA_block.$mRNA_rec_end; push @CDS_blocks,$gene_block.$aux_str.$CDS_block; &trace("mRNA considered equ CDS",4); &trace($gene_block,0); $mRNA_assign_err_c++; } } else {&trace("intronless CDS",4); &trace($gene_block,0) } } else {&trace("CDS format error",4); &trace($gene_block,0); $CDS_format_error_c++} } if ($mRNA_assing_controvers_flg) {$mRNA_match_controvers_gene_c++} if ($err_code!=0) {$aux_c3++} if (($err_code==0) and ($CDS_exon_num>1) and ($mRNA_num>0)) {$err_code=0} else {$err_code=1} #this line has no meaning return $err_code; } sub get_mRNA_CDS0{ #SPL new subroutine to extract block of lines belonging to one CDS feature #SPL including gene feature and all mRNA feature #SPL special version for handling all CDS without wings #SPL produces dummy mRNA record to include into prefix.rac my @temp_g_b=split("\n",$gene_block); #print '$#temp_g_b=',$#temp_g_b; #&press_any_key; my $cds_flg=0; #SPL 1 = cds block available my $gene_line_flg=0; #SPL 1 = gene, mRNA, or CDS lines my $CDS_line_flg=0; #SPL 1 = gene, mRNA, or CDS lines my $CDS_block=''; my $CDS_flg=0; my $mRNA_flg=0; my $mRNA_line_flg=0; my $gene_rec_end="_GENE_REC_END_\n"; my $mRNA_rec_end="_mRNA_REC_END_\n"; my $aux_str; my $i=0; my $mRNA_assing_controvers_flg=0; my $gene_block=''; $mRNA_block=''; # first gather gene feaure and all mRNA features while($i<= $#temp_g_b){ # print $temp_g_b[$i],"\n"; # &press_any_key; if ($temp_g_b[$i] =~ /^\s{5}gene\s{1}/){ $gene_line_flg=1} elsif ($temp_g_b[$i] =~ /^\s{5}mRNA\s{1}/) { $mRNA_line_flg=1;$gene_line_flg=0} elsif ($temp_g_b[$i] =~ /^\s{5}CDS\s{1}/) {$gene_line_flg=0; $mRNA_line_flg=0} if ($gene_line_flg==1) {$gene_block.="$temp_g_b[$i]\n"} if ($mRNA_line_flg==1) {$mRNA_block.="$temp_g_b[$i]\n"} $i++; }#SPL end while # second prepare long and short mRNA description stack my @temp_m_b=split("\n",$mRNA_block); # print 'check $#temp_m_b=',$#temp_m_b; # &press_any_key; my $mRNA_rec=''; @long_mRNA_stack=(); @mRNA_stack=(); $i=0; while($i<= $#temp_m_b){ # print $temp_m_b[$i],"\n"; # &press_any_key; if ($temp_m_b[$i] =~ /^\s{5}mRNA\s{1}/){ $mRNA_line_flg=1; if ($mRNA_flg==1) {push @long_mRNA_stack,$mRNA_rec; push @mRNA_stack,&purge_feature_record($mRNA_rec); } $mRNA_flg=1; $mRNA_rec=$temp_m_b[$i]."\n"; } else {if ($mRNA_line_flg==1) {$mRNA_rec.=$temp_m_b[$i]."\n"}} $i++; }#SPL end while if ($mRNA_flg==1) {push @long_mRNA_stack,$mRNA_rec; push @mRNA_stack,&purge_feature_record($mRNA_rec);} # third add CDS feature for every CDS feature # replase RNA block by CDS block $i=0;$CDS_flg=0;$CDS_line_flg=0; $c_CDS_num=0; my $CDS_exon_num=0; my $mRNA_num=0; my $mRNA_match_c=0; my $err_code=0; my $aux_RNA_block=''; while($i<= $#temp_g_b){ if ($temp_g_b[$i] =~ /^\s{5}CDS\s{1}/){ $CDS_line_flg=1; $cds_feature_counter++; $coding_gene_flg=1; #a19 if ($CDS_flg==1) {$CDS_exon_num=&check_CDS_block($CDS_block); if ($CDS_exon_num>1) { $aux_c1++; $aux_RNA_block=$CDS_block; $aux_RNA_block=~s/ CDS / mRNA /; $aux_RNA_block.=' /SPLnote="DUMMY"'."\n"; $aux_str=$gene_rec_end.$aux_RNA_block.$mRNA_rec_end; push @CDS_blocks,$gene_block.$aux_str.$CDS_block; } else {&trace("intronless CDS",4); &trace($gene_block,0) } $CDS_block=''; } $c_CDS_num++; $CDS_flg=1; } elsif ($temp_g_b[$i] =~ /^\s{5}\w+\s{1}/){ $CDS_line_flg=0} if ($CDS_line_flg==1) {$CDS_block.="$temp_g_b[$i]\n"} $i++; }#SPL end while if ($CDS_flg==1) {$CDS_exon_num=&check_CDS_block($CDS_block); if ($CDS_exon_num>1) { $aux_c1++; $aux_RNA_block=$CDS_block; $aux_RNA_block=~s/ CDS / mRNA /; $aux_RNA_block.=' /SPLnote="DUMMY"'."\n";#corr in 1.13 $aux_str=$gene_rec_end.$aux_RNA_block.$mRNA_rec_end; push @CDS_blocks,$gene_block.$aux_str.$CDS_block; } else {&trace("intronless CDS",4); &trace($gene_block,0) } } return $err_code; } sub get_mRNA_CDS02 #($c_gene_block) #SPL new subroutine to extract block of lines belonging to one CDS feature #SPL NOT including gene feature and all mRNA feature #SPL special version for handling all CDS without wings #SPL to scan all CDS of the contig {my $c_gene_block=$_[0]; my @temp_g_b=split("\n",$c_gene_block); #print '$#temp_g_b=',$#temp_g_b; #&press_any_key; my $cds_flg=0; #SPL 1 = cds block available my $gene_line_flg=0; #SPL 1 = gene, mRNA, or CDS lines my $CDS_line_flg=0; #SPL 1 = gene, mRNA, or CDS lines my $CDS_block=''; my $CDS_flg=0; my $mRNA_flg=0; my $mRNA_line_flg=0; my $gene_rec_end="_GENE_REC_END_\n"; my $mRNA_rec_end="_mRNA_REC_END_\n"; my $aux_str; my $i=0; my $mRNA_assing_controvers_flg=0; my $gene_block2=''; $mRNA_block2=''; # first gather gene feaure and all mRNA features while($i<= $#temp_g_b){ # print $temp_g_b[$i],"\n"; # &press_any_key; if ($temp_g_b[$i] =~ /^\s{5}gene\s{1}/){ $gene_line_flg=1} elsif ($temp_g_b[$i] =~ /^\s{5}mRNA\s{1}/) { $mRNA_line_flg=1;$gene_line_flg=0} elsif ($temp_g_b[$i] =~ /^\s{5}CDS\s{1}/) {$gene_line_flg=0; $mRNA_line_flg=0} if ($gene_line_flg==1) {$gene_block2.="$temp_g_b[$i]\n"} if ($mRNA_line_flg==1) {$mRNA_block2.="$temp_g_b[$i]\n"} $i++; }#SPL end while # third add CDS feature for every CDS feature # replase RNA block by CDS block $i=0;$CDS_flg=0;$CDS_line_flg=0; $c_CDS_num=0; my $CDS_exon_num=0; my $mRNA_num=0; my $mRNA_match_c=0; my $err_code=0; $aux_CDS_block=''; while($i<= $#temp_g_b){ if ($temp_g_b[$i] =~ /^\s{5}CDS\s{1}/){ $CDS_line_flg=1; if ($CDS_flg==1) {#$CDS_exon_num=&check_CDS_block($aux_CDS_block); push @CDS_blocks2,$gene_block2.$mRNA_rec_end.$aux_CDS_block; # print "$aux_CDS_block\n";&press_any_key; $aux_CDS_block=''; } $c_CDS_num++; $CDS_flg=1; } elsif ($temp_g_b[$i] =~ /^\s{5}\w+\s{1}/){ $CDS_line_flg=0} if ($CDS_line_flg==1) {$aux_CDS_block.="$temp_g_b[$i]\n"} $i++; }#SPL end while if ($CDS_flg==1) {# $CDS_exon_num=&check_CDS_block($aux_CDS_block); push @CDS_blocks2,$gene_block2.$mRNA_rec_end.$aux_CDS_block; # print "$aux_CDS_block\n";&press_any_key; } return $err_code; } sub get_mRNA_CDS_GB149{ #SPL new subroutine to extract block of lines belonging to one CDS feature #SPL including gene feature and all mRNA feature my @temp_g_b=split("\n",$gene_block); #print '$#temp_g_b=',$#temp_g_b; #&press_any_key; my $cds_flg=0; #SPL 1 = cds block available my $gene_line_flg=0; #SPL 1 = gene, mRNA, or CDS lines my $CDS_line_flg=0; #SPL 1 = gene, mRNA, or CDS lines my $CDS_block=''; my $CDS_flg=0; my $mRNA_flg=0; my $mRNA_line_flg=0; my $gene_rec_end="_GENE_REC_END_\n"; my $mRNA_rec_end="_mRNA_REC_END_\n"; my $aux_str; my $i=0; my $mRNA_assing_controvers_flg=0; my $gene_block=''; $mRNA_block=''; my $c_i2=0; # first gather gene feature and all mRNA features while($i<= $#temp_g_b){ # print $temp_g_b[$i],"\n"; # &press_any_key; if ($temp_g_b[$i] =~ /^\s{5}gene\s{1}/){ $gene_line_flg=1} elsif ($temp_g_b[$i] =~ /^\s{5}mRNA\s{1}/) { $mRNA_line_flg=1;$gene_line_flg=0} elsif ($temp_g_b[$i] =~ /^\s{5}CDS\s{1}/) {$gene_line_flg=0; $mRNA_line_flg=0} if ($gene_line_flg==1) {$gene_block.="$temp_g_b[$i]\n"} if ($mRNA_line_flg==1) {$mRNA_block.="$temp_g_b[$i]\n"} $i++; }#SPL end while # second prepare long and short mRNA description stack my @temp_m_b=split("\n",$mRNA_block); # print 'check $#temp_m_b=',$#temp_m_b; # &press_any_key; my $mRNA_rec=''; @long_mRNA_stack=(); @mRNA_stack=(); $i=0; while($i<= $#temp_m_b){ # print $temp_m_b[$i],"\n"; # &press_any_key; if ($temp_m_b[$i] =~ /^\s{5}mRNA\s{1}/){ $mRNA_line_flg=1; if ($mRNA_flg==1) {push @long_mRNA_stack,$mRNA_rec; push @mRNA_stack,&purge_feature_record($mRNA_rec); } $mRNA_flg=1; $mRNA_rec=$temp_m_b[$i]."\n"; } else {if ($mRNA_line_flg==1) {$mRNA_rec.=$temp_m_b[$i]."\n"}} $i++; }#SPL end while if ($mRNA_flg==1) {push @long_mRNA_stack,$mRNA_rec; push @mRNA_stack,&purge_feature_record($mRNA_rec);} # if ($aux_gene_name eq 'RNU12') # print "aux stop\n";&press_any_key; #print 'lmRNA_stack>>',@long_mRNA_stack,'<<',$eol_str; &press_any_key;#exon #print ' mRNA_stack>>',@mRNA_stack,'<<',$eol_str; &press_any_key;#exon # mark mRNA stack @mRNA_stack_errs=(); my $c_mRNA_rec=''; my $c_mRNA_rec_i=0; foreach $c_mRNA_rec (@mRNA_stack) { #print ">>>$c_mRNA_rec <<<";&press_any_key; $c_mRNA_rec=&expand_rec_1nuc2($c_mRNA_rec); my $mRNA_err=&get_feature_start_end4($c_mRNA_rec); push @mRNA_stack_errs,$mRNA_err; if ($mRNA_err) {$rejected_mRNA_recs_due_to_err++; &trace("rejected mRNA rec:",4); &trace($long_mRNA_stack[$c_mRNA_rec_i],0); print "rejected mRNA rec:$long_mRNA_stack[$c_mRNA_rec_i]\n"; } $c_mRNA_rec_i++; } # here calculating alternative exon count my $c_mRNA_ind1=0; my $c_mRNA_ind2=0; my $c_mRNA_rec1=''; my $c_mRNA_rec1_aux=''; my $c_mRNA_rec2=''; my @irremovable_exons=(); my @out_irr_array=(); my @all_exons_arr=(); my @out_all_exons_arr=(); my $aux_rec1=''; my $aux_rec2=''; my @aux_rec1_stack=(); my @aux_rec2_stack=(); my $aux_first_OK_flg=1; # aux to find first OK mRNA rec foreach (@mRNA_stack) { $c_mRNA_rec1=$_; # print "s>>>@mRNA_stack <<>>$c_mRNA_rec1 <<<";&press_any_key; if ($mRNA_stack_errs[$c_mRNA_ind1]==0) { #print ">>>$c_mRNA_rec1 <<<";&press_any_key; $c_mRNA_rec1=~s/[\s\n]//g; $c_mRNA_rec1=&expand_rec_1nuc2($c_mRNA_rec1); while (my $c_res1=($c_mRNA_rec1=~m/\(/)) {$c_mRNA_rec1=$';} $c_mRNA_rec1=~m/\)/; $c_mRNA_rec1=$`; @aux_rec1_stack=split/,/, $c_mRNA_rec1; if ($aux_first_OK_flg==1) {$aux_first_OK_flg=0; @irremovable_exons=@aux_rec1_stack; #print "start>@irremovable_exons@out_all_exons_arr>>$c_mRNA_rec2 <<<";&press_any_key; # print "$c_i2: s4>>>@mRNA_stack2 <<$c_mRNA_ind1) && ($mRNA_stack_errs[$c_mRNA_ind2]==0)) { $c_mRNA_rec2=~s/[\s\n]//g; $c_mRNA_rec2=&expand_rec_1nuc2($c_mRNA_rec2); while (my $c_res2=($c_mRNA_rec2=~m/\(/)) {$c_mRNA_rec2=$';} $c_mRNA_rec2=~m/\)/; $c_mRNA_rec2=$`; @aux_rec2_stack=split/,/, $c_mRNA_rec2; #print "2>@aux_rec2_stack<2\n";&press_any_key; @out_irr_array=(); foreach $c_irr (@irremovable_exons) {foreach $c_exon (@aux_rec2_stack) { #print "$c_irr :: $c_exon",; if ($c_irr eq $c_exon) {#print " ="; push @out_irr_array, $c_irr} # print "\n";&press_any_key; } } # end comparing exons @irremovable_exons=@out_irr_array; #print "c_st1>@out_irr_array@irremovable_exons@irremovable_exons>>@mRNA_stack <<>>$c_mRNA_rec1 <<<";&press_any_key; if ($mRNA_stack_errs[$c_mRNA_ind1]==0) { #print ">>>$c_mRNA_rec1 <<<";&press_any_key; $OK_mRNA_c++; $c_mRNA_rec1=~s/[\s\n]//g; $c_mRNA_rec1=&expand_rec_1nuc2($c_mRNA_rec1); while ($c_res1=($c_mRNA_rec1=~m/\(/)) {$c_mRNA_rec1=$';} $c_mRNA_rec1=~m/\)/; $c_mRNA_rec1=$`; @aux_rec1_stack=split/,/, $c_mRNA_rec1; foreach $c_exon (@aux_rec1_stack) {$aux_irr_exon_flg=0; foreach $c_irr (@irremovable_exons) { #print "$c_irr :: $c_exon",; if ($c_irr eq $c_exon) {#print " ="; $aux_irr_exon_flg=1;} # print "\n";&press_any_key; } if ($aux_irr_exon_flg==0) {$c_alternative_exons_c++;} } # end comparing exons # seems irrelevant position }$c_mRNA_ind1++;} #end foreach1 #print "gene=$aux_gene_name\n"; #print "irr_exon_c=$c_irremovable_exons_c\n"; #print "alt_exon_c0=$c_alternative_exons_c\n"; my $diff_alt_c=$#all_exons_arr+1-$c_irremovable_exons_c; #print "alt_exon_c=$diff_alt_c\n";#&press_any_key; # calculating skipped exons my @rest_exons=(); #@all_exons_arr; foreach $c_exon (@all_exons_arr) {my $aux_exon_flg=0; foreach $c_irr (@irremovable_exons) { if ($c_exon eq $c_irr) {$aux_exon_flg=1}} if ($aux_exon_flg==0) {push @rest_exons,$c_exon} } #end foreach 1 # print "all: @all_exons_arr\n"; #&press_any_key; # print "irr: @irremovable_exons\n";#&press_any_key; # print "rest: @rest_exons\n"; my @skipped_exons=(); my $aux_skip_exon_flg=1; foreach $c_rest (@rest_exons) {$aux_skip_exon_flg=0; $c_mRNA_ind1=0; $c_overlap_flg=0; foreach (@mRNA_stack) { $c_mRNA_rec1=$_; # print "s>>>@mRNA_stack <<>>$c_mRNA_rec1 <<<";&press_any_key; if ($mRNA_stack_errs[$c_mRNA_ind1]==0) { #print ">>>$c_mRNA_rec1 <<<";&press_any_key; $aux_c_flg=1; $c_mRNA_rec1=~s/[\s\n]//g; $c_mRNA_rec1=&expand_rec_1nuc2($c_mRNA_rec1); while ($c_res1=($c_mRNA_rec1=~m/\(/)) {$c_mRNA_rec1=$';} $c_mRNA_rec1=~m/\)/; $c_mRNA_rec1=$`; @aux_rec1_stack=split/,/, $c_mRNA_rec1; foreach $c_exon (@aux_rec1_stack) {#print "$c_rest vs $c_exon\n"; if ($c_rest eq $c_exon) {$aux_c_flg=0;} else {$c_overlap=&get_overlap_exon($c_rest,$c_exon); # print "overlap=$c_overlap\n"; if ($c_overlap!=0) {$aux_c_flg=0;$c_overlap_flg=1} } # end else # print "aux=$aux_c_flg\n"; } # end comparing exons if ($aux_c_flg==1) {$aux_skip_exon_flg=1}; # print "aux=$aux_c_flg aux_skip=$aux_skip_exon_flg\n";&press_any_key; } #end if good mRNA record $c_mRNA_ind1++; } #end foreach mRNA variant if ($c_overlap_flg==1) {$aux_skip_exon_flg=0}; # clear skip flag # if overlap but not # equality anywhere if ($aux_skip_exon_flg==1) {push @skipped_exons,$c_rest} } #end foreach rest my $skipped_exon_c=$#skipped_exons+1; # print "skipped_exon_c=$skipped_exon_c\n"; # discriminate internal skipped exons and marginal skipped exons # this is improper section where only first marginal exons # but in all RNA is counted # stored temporary for debug my $c_exon_c=0; my $marginal_skipped_exons_c=0; foreach $c_skipped (@skipped_exons) {$aux_skip_exon_flg=0; $c_mRNA_ind1=0; my $marginal_skipped_exon_avail_flg=0; foreach (@mRNA_stack) { $c_mRNA_rec1=$_; # print "s>>>@mRNA_stack <<>>$c_mRNA_rec1 <<<";&press_any_key; if ($mRNA_stack_errs[$c_mRNA_ind1]==0) { #print ">>>$c_mRNA_rec1 <<<";&press_any_key; $aux_c_flg=0; #here indicate marginal exon $c_mRNA_rec1=~s/[\s\n]//g; $c_mRNA_rec1=&expand_rec_1nuc2($c_mRNA_rec1); while ($c_res1=($c_mRNA_rec1=~m/\(/)) {$c_mRNA_rec1=$';} $c_mRNA_rec1=~m/\)/; $c_mRNA_rec1=$`; @aux_rec1_stack=split/,/, $c_mRNA_rec1; $c_exon_c=0; foreach $c_exon (@aux_rec1_stack) {#print "$c_rest vs $c_exon\n"; if (($c_skipped eq $c_exon) and (($c_exon_c==0) or ($c_exon_c==$#aux_rec1_stack))) {$aux_c_flg=1;} # print "aux=$aux_c_flg\n"; $c_exon_c++; } # end comparing exons if ($aux_c_flg==1) {$marginal_skipped_exon_avail_flg=1}; # print "aux=$aux_c_flg aux_skip=$aux_skip_exon_flg\n";&press_any_key; } #end if good mRNA record $c_mRNA_ind1++; } #end foreach mRNA variant if ($marginal_skipped_exon_avail_flg==1) {$marginal_skipped_exons_c++} } #end foreach rest # print "marginal_skipped_exon_c=$marginal_skipped_exons_c\n"; # discriminate internal skipped exons and marginal skipped exons # this section searchs for all marginal skipped exon only # in protruding RNA # possibly invalid proc for nonoverlapped RNA? $c_exon_c=0; $marginal_skipped_exons_c=0; #search for left and right protruding mRNA my $left_end=999999999999; my $right_end=0; my $rightmost_left_end=0; my $leftmost_right_end=999999999999; $c_mRNA_ind1=0; foreach (@mRNA_stack) { $c_mRNA_rec1=$_; # print "s>>>@mRNA_stack <<>>$c_mRNA_rec1 <<<";&press_any_key; if ($mRNA_stack_errs[$c_mRNA_ind1]==0) { #print ">>>$c_mRNA_rec1 <<<";&press_any_key; $aux_c_flg=0; #here indicate marginal exon $c_mRNA_rec1=~s/[\s\n]//g; $c_mRNA_rec1=&expand_rec_1nuc2($c_mRNA_rec1); while ($c_res1=($c_mRNA_rec1=~m/\(/)) {$c_mRNA_rec1=$';} $c_mRNA_rec1=~m/\)/; $c_mRNA_rec1=$`; @aux_rec1_stack=split/,/, $c_mRNA_rec1; $c_exon_c=0; foreach $c_exon (@aux_rec1_stack) {#print "$c_rest vs $c_exon\n"; ($c_start1,$c_end1,$c_err1)=&get_feature_start_end2($c_exon); if ($c_exon_c==0) {if ($c_start1<$left_end) {$left_end=$c_start1} # seems unused later if ($c_start1>$rightmost_left_end) {$rightmost_left_end=$c_start1} } if ($c_exon_c==$#aux_rec1_stack) {if ($c_end1>$right_end) {$right_end=$c_end1} # seems unused later if ($c_end1<$leftmost_right_end) {$leftmost_right_end=$c_end1} } $c_exon_c++; } # end comparing exons } #end if good mRNA record $c_mRNA_ind1++; } #end foreach mRNA variant # print "$left_end $rightmost_left_end $leftmost_right_end $right_end\n"; my @marginal_skipped_arr=(); # print "skip: @skipped_exons\n"; foreach $c_skipped (@skipped_exons) {$aux_skip_exon_flg=0; $c_mRNA_ind1=0; foreach (@mRNA_stack) { $c_mRNA_rec1=$_; # print "s>>>@mRNA_stack <<>>$c_mRNA_rec1 <<<";&press_any_key; if ($mRNA_stack_errs[$c_mRNA_ind1]==0) { #print ">>>$c_mRNA_rec1 <<<";&press_any_key; $c_mRNA_rec1=~s/[\s\n]//g; $c_mRNA_rec1=&expand_rec_1nuc2($c_mRNA_rec1); while ($c_res1=($c_mRNA_rec1=~m/\(/)) {$c_mRNA_rec1=$';} $c_mRNA_rec1=~m/\)/; $c_mRNA_rec1=$`; @aux_rec1_stack=split/,/, $c_mRNA_rec1; foreach $c_exon (@aux_rec1_stack) {#print "$c_rest vs $c_exon\n"; if ($c_skipped eq $c_exon) {($c_start1,$c_end1,$c_err1)=&get_feature_start_end2($c_exon); if (($c_start1>$leftmost_right_end) or ($c_end1<$rightmost_left_end)) { push (@marginal_skipped_arr,$c_skipped)} } #if current mRNA exon is skipped } # end comparing exons # print "aux=$aux_c_flg aux_skip=$aux_skip_exon_flg\n";&press_any_key; } #end if good mRNA record $c_mRNA_ind1++; } #end foreach mRNA variant } #end foreach skipped # print "marginal: @marginal_skipped_arr\n"; $marginal_skipped_exons_c=&num_diff_items(@marginal_skipped_arr); # print "marginal_skipped_exon_c=$marginal_skipped_exons_c\n"; # calculating rest exon by subtracting skipped exons my @out_rest_exons=(); foreach $c_exon (@rest_exons) {my $aux_exon_flg=0; foreach $c_skip (@skipped_exons) { if ($c_exon eq $c_skip) {$aux_exon_flg=1}} if ($aux_exon_flg==0) {push @out_rest_exons,$c_exon} } #end foreach 1 # print "rest: @out_rest_exons\n"; @rest_exons=@out_rest_exons; # # producing nonoverlapped classes # # first produce projection, nonchalance for comparing twice # 050807 this block has a bug, taken under comment, # see beneath another algorithm # my @proj_arr=(); # my @aux_proj_arr=(); # my $c_exon_num=0; # my $no_overlap_flg=1; # my @rewritten_stack=(); # if ($#rest_exons>=0) {$proj_arr[0]=$rest_exons[0]; # @aux_proj_arr=@_proj_arr; # foreach (@rest_exons) # { $c_exon=$_; #HERE MODIFIED ERROR JOINNIG PROC # #BACKWARD INFLUENCE ON @rest_exons OTHERWISE # $c_exon_num=0; $no_overlap_flg=1; # @rewritten_stack=(); # foreach $cc_exon (@proj_arr) # { ($c_overlap,$c_rec)=&get_joint_exon($c_exon,$cc_exon); # print "$c_exon & $cc_exon=>$c_rec ($c_exon_num)\n"; # if ($c_overlap==1) {$aux_proj_arr[$c_exon_num]=$c_rec; # push @rewritten_stack,$c_exon_num; # $no_overlap_flg=0; # } # $c_exon_num++; # } #end foreach 2 # if ($no_overlap_flg==1) {push @aux_proj_arr,$c_exon}; ## print "c_jnt_p: @aux_proj_arr\n"; &press_any_key; # @proj_arr=@aux_proj_arr; # if ($#rewritten_stack>=1) { # @aux_proj_arr=(); # # get common exon # $c_exon=$proj_arr[$rewritten_stack[0]]; # for (my $i=1; $i<=$#rewritten_stack;$i++) # { $cc_exon=$proj_arr[$rewritten_stack[$i]]; # ($c_overlap,$c_rec)=&get_joint_exon($c_exon,$cc_exon); # if ($c_overlap==1) {$c_exon=$c_rec; # } else {&trace("error in joining alg",4);} # } # for in joining rewritten exons # # put common exon into arr # $c_exon_num=0; my $aux_c_exon_num=0; # print "rewritten_stack_depth=$#rewritten_stack",$eol_str; # print $rewritten_stack[0],$eol_str; # print $rewritten_stack[1],$eol_str; # foreach $cc_exon (@proj_arr) # { if ($c_exon_num==$rewritten_stack[0]) # {push @aux_proj_arr,$c_exon;$aux_c_exon_num++} # else {if ($c_exon_num==$rewritten_stack[$aux_c_exon_num]) # {$aux_c_exon_num++} # else {push @aux_proj_arr,$proj_arr[$c_exon_num]} # } #end else # $c_exon_num++; # } # } #end if rewritten stack ## print "c_joint: @aux_proj_arr\n"; &press_any_key; # @proj_arr=@aux_proj_arr; # } #end foreach 1 # } #end if non empty set # print "joint: @proj_arr\n"; # producing nonoverlapped classes # first produce projection, nonchalance for comparing twice # alternative algoritm to produce projection my @proj_arr=(); my @aux_proj_arr=(); my $c_exon_num=0; my $overlap_flg=1; my $i1=0; my $i2=0; @aux_proj_arr=@rest_exons; @proj_arr=@rest_exons; if ($#rest_exons>0) { while ($overlap_flg==1) {$overlap_flg=0; $i1=0; while (($i1<$#aux_proj_arr) and ($overlap_flg==0)) {$i2=$i1+1; while (($i2<=$#aux_proj_arr) and ($overlap_flg==0)) {$c_exon=$aux_proj_arr[$i1]; $cc_exon=$aux_proj_arr[$i2]; ($c_overlap,$c_rec)=&get_joint_exon($c_exon,$cc_exon); # print "$c_exon & $cc_exon=>$c_rec \n"; if ($c_overlap==1) {$overlap_flg=1; # copiing to proj_arr @proj_arr=(); push @proj_arr,$c_rec; for (my $j1=0; $j1<=$#aux_proj_arr; $j1++) { if (($j1!=$i1) and ($j1!=$i2)) {push @proj_arr,$aux_proj_arr[$j1]} } @aux_proj_arr=@proj_arr; } #end if overlap found $i2++; } #end while second exon in pair $i1++; } #end while first exon in pair } #end while overlap available @proj_arr=@aux_proj_arr; } #end if more then single element in rest_exons # print "joint: @proj_arr\n"; # # second sequentially produce every nonoverlapped class for demonstration # my $class_num=0; #{1 start} # print "out_rest demo: @out_rest_exons\n"; # print " rest demo: @rest_exons\n"; # foreach $c_class (@proj_arr) # {$class_num++; # print "class $class_num: "; # foreach $c_exon (@rest_exons) # {print '(',"$c_exon ",')'; # $c_overlap=&get_overlap_exon($c_class,$c_exon); # if ($c_overlap!=0) {print "$c_exon "} # } # print "\n"; # } #end foreach 1 # third sequentially produce every nonoverlapped class # and check for truncated exons $class_num=0; #{1 start} my $trunc_class_num=0; my @out_rest_class=(); foreach $c_class (@proj_arr) {$class_num++; my $c_5or3_OK_flg=1; @c_class_arr=(); foreach $c_exon (@rest_exons) {$c_overlap=&get_overlap_exon($c_class,$c_exon); if ($c_overlap!=0) {push @c_class_arr,$c_exon } } foreach $c_exon (@c_class_arr) {$c_overlap=&get_5or3_exon($c_class,$c_exon); if ($c_overlap==0) {$c_5or3_OK_flg=0} } if ($c_5or3_OK_flg==1) {$trunc_class_num++} else {push @out_rest_class,$c_class} } #end foreach 1 my $others_num=$#out_rest_class+1; # print "trunc=$trunc_class_num others=$others_num\n"; # calculating retained introns among trunc and others exons my $retained_intron_c=0; foreach $c_class (@proj_arr) {my $retained_intron_flg=0; $c_mRNA_ind1=0; foreach (@mRNA_stack) { $c_mRNA_rec1=$_; # print "s>>>@mRNA_stack <<>>$c_mRNA_rec1 <<<";&press_any_key; if ($mRNA_stack_errs[$c_mRNA_ind1]==0) { #print ">>>$c_mRNA_rec1 <<<";&press_any_key; $aux_c_flg=1; $c_mRNA_rec1=~s/[\s\n]//g; $c_mRNA_rec1=&expand_rec_1nuc2($c_mRNA_rec1); while ($c_res1=($c_mRNA_rec1=~m/\(/)) {$c_mRNA_rec1=$';} $c_mRNA_rec1=~m/\)/; $c_mRNA_rec1=$`; @aux_rec1_stack=split/,/, $c_mRNA_rec1; my $c_ex_i=0; my $c_exon1=''; my $c_exon2=''; for ($c_ex_i=0; $c_ex_i<$#aux_rec1_stack;$c_ex_i++) {$c_exon1=$aux_rec1_stack[$c_ex_i]; $c_exon2=$aux_rec1_stack[$c_ex_i+1]; $c_overlap=&get_overlap_exon($c_exon1,$c_class); # print "overlap=$c_overlap\n"; if ($c_overlap!=0) { $c_overlap=&get_overlap_exon($c_exon2,$c_class); if ($c_overlap!=0) {$retained_intron_flg=1} # assume exon1 and exon2 do not overlap } #end is first overlap # print "aux=$aux_c_flg\n"; } # end comparing exons to class for specific mRNA # print "aux=$aux_c_flg aux_skip=$aux_skip_exon_flg\n";&press_any_key; } #end if good mRNA record $c_mRNA_ind1++; } #end foreach mRNA variant if ($retained_intron_flg==1) {$retained_intron_c++} } #end foreach class # print "retained_intron_c=$retained_intron_c\n"; # calculating retained introns among others exons my $retained_intron_others_c=0; foreach $c_class (@out_rest_class) {my $retained_intron_others_flg=0; $c_mRNA_ind1=0; foreach (@mRNA_stack) { $c_mRNA_rec1=$_; # print "s>>>@mRNA_stack <<>>$c_mRNA_rec1 <<<";&press_any_key; if ($mRNA_stack_errs[$c_mRNA_ind1]==0) { #print ">>>$c_mRNA_rec1 <<<";&press_any_key; $aux_c_flg=1; $c_mRNA_rec1=~s/[\s\n]//g; $c_mRNA_rec1=&expand_rec_1nuc2($c_mRNA_rec1); while ($c_res1=($c_mRNA_rec1=~m/\(/)) {$c_mRNA_rec1=$';} $c_mRNA_rec1=~m/\)/; $c_mRNA_rec1=$`; @aux_rec1_stack=split/,/, $c_mRNA_rec1; $c_ex_i=0; $c_exon1=''; $c_exon2=''; for ($c_ex_i=0; $c_ex_i<$#aux_rec1_stack;$c_ex_i++) {$c_exon1=$aux_rec1_stack[$c_ex_i]; $c_exon2=$aux_rec1_stack[$c_ex_i+1]; $c_overlap=&get_overlap_exon($c_exon1,$c_class); # print "overlap=$c_overlap\n"; if ($c_overlap!=0) { $c_overlap=&get_overlap_exon($c_exon2,$c_class); if ($c_overlap!=0) {$retained_intron_others_flg=1} # assume exon1 and exon2 do not overlap } #end is first overlap # print "aux=$aux_c_flg\n"; } # end comparing exons to class for specific mRNA # print "aux=$aux_c_flg aux_skip=$aux_skip_exon_flg\n";&press_any_key; } #end if good mRNA record $c_mRNA_ind1++; } #end foreach mRNA variant if ($retained_intron_others_flg==1) {$retained_intron_others_c++} } #end foreach class # print "retained_intron_c=$retained_intron_c (in others=$retained_intron_others_c)\n"; # calculating retained introns among others exons # and calculating the number of such introns (not only islands) # for debug $retained_intron_others_c=0; my $retained_intron_others_cc=0; foreach $c_class (@out_rest_class) {my $retained_intron_others_flg=0; $c_mRNA_ind1=0; my @retained_intron_arr=(); foreach (@mRNA_stack) { $c_mRNA_rec1=$_; # print "s>>>@mRNA_stack <<>>$c_mRNA_rec1 <<<";&press_any_key; if ($mRNA_stack_errs[$c_mRNA_ind1]==0) { #print ">>>$c_mRNA_rec1 <<<";&press_any_key; $aux_c_flg=1; $c_mRNA_rec1=~s/[\s\n]//g; $c_mRNA_rec1=&expand_rec_1nuc2($c_mRNA_rec1); while ($c_res1=($c_mRNA_rec1=~m/\(/)) {$c_mRNA_rec1=$';} $c_mRNA_rec1=~m/\)/; $c_mRNA_rec1=$`; @aux_rec1_stack=split/,/, $c_mRNA_rec1; $c_ex_i=0; $c_exon1=''; $c_exon2=''; for ($c_ex_i=0; $c_ex_i<$#aux_rec1_stack;$c_ex_i++) {$c_exon1=$aux_rec1_stack[$c_ex_i]; $c_exon2=$aux_rec1_stack[$c_ex_i+1]; $c_overlap=&get_overlap_exon($c_exon1,$c_class); # print "overlap=$c_overlap\n"; if ($c_overlap!=0) { $c_overlap=&get_overlap_exon($c_exon2,$c_class); if ($c_overlap!=0) {$retained_intron_others_flg=1; ($c_start1,$c_end1,$c_err1)=&get_feature_start_end2($c_exon1); ($c_start2,$c_end2,$c_err2)=&get_feature_start_end2($c_exon2); push(@retained_intron_arr,$c_end1.'..'.$c_start2); } # assume exon1 and exon2 do not overlap } #end is first overlap # print "aux=$aux_c_flg\n"; } # end comparing exons to class for specific mRNA # print "aux=$aux_c_flg aux_skip=$aux_skip_exon_flg\n";&press_any_key; } #end if good mRNA record $c_mRNA_ind1++; } #end foreach mRNA variant if ($retained_intron_others_flg==1) {$retained_intron_others_c++; $retained_intron_others_cc=$retained_intron_others_cc+ &num_diff_items(@retained_intron_arr); } } #end foreach class # print "retained_intron_c=$retained_intron_c (in others=$retained_intron_others_c($retained_intron_others_cc))\n"; #printing and store section $aux_out_str='const='.$c_irremovable_exons_c.' alt='.$c_alternative_exons_c.' '; $aux_out_str.='diff='.$diff_alt_c.' skip='.$skipped_exon_c.' '; $aux_val=$skipped_exon_c-$marginal_skipped_exons_c; $aux_out_str.='int='.$aux_val.' '; $aux_out_str.='trunc='.$trunc_class_num.' others='.$others_num.' '; $aux_out_str.='ret_i='.$retained_intron_c.' ('; $aux_out_str.=$retained_intron_c-$retained_intron_others_c.'/'; $aux_out_str.=$retained_intron_others_c.'('.$retained_intron_others_cc.'))'; # print "gene=$aux_gene_name OK_mRNA_c=$OK_mRNA_c\n"; # print $aux_out_str,"\n"; if ($OK_mRNA_c>=2) {$total_alt_mRNA_gene_c++; $total_OK_mRNA_in_alt_mRNA_gene_c+=$OK_mRNA_c; $total_irremovable_exons_c+=$c_irremovable_exons_c; $total_alternative_exons_c+=$c_alternative_exons_c; $total_diff_alt_c+=$diff_alt_c; $total_skipped_exon_c+=$skipped_exon_c; $total_skipped_internal+=$skipped_exon_c-$marginal_skipped_exons_c; $total_trunc_class_num+=$trunc_class_num; $total_others_num+=$others_num; $total_retained_intron_c+=$retained_intron_c; $total_retained_intron_in_trunc_c+=$retained_intron_c-$retained_intron_others_c; $total_retained_intron_others_c+=$retained_intron_others_c; $total_retained_intron_others_cc+=$retained_intron_others_cc; } if ($OK_mRNA_c>=2) { &trace("gene=$aux_gene_name OK_mRNA_c=$OK_mRNA_c",4); &trace($aux_out_str,4)} #&press_any_key; # third add CDS feature for every CDS feature $i=0;$CDS_flg=0;$CDS_line_flg=0; $c_CDS_num=0; my $CDS_exon_num=0; my $mRNA_num=0; my $mRNA_match_c=0; my $err_code=0; my $SPL_note_rec=' /SPL_note="mRNA assignment controversial"'; my $aux_rec=''; my $aux_RNA_block=''; while($i<= $#temp_g_b){ if ($temp_g_b[$i] =~ /^\s{5}CDS\s{1}/){ $CDS_line_flg=1; $cds_feature_counter++; $coding_gene_flg=1; #a19 if ($CDS_flg==1) { # print '>>',$CDS_block,'<<',$eol_str; &press_any_key;#s13 ($mRNA_num,$CDS_exon_num,$mRNA_match_c,$err_code)= &check_mRNA_block_CDS_block_rel_gb149($CDS_block); # print '$mRNA_num=',$mRNA_num,' $CDS_exon_num=',$CDS_exon_num, # ' $mRNA_match_c=',$mRNA_match_c,' $err_code=',$err_code,$eol_str; # &press_any_key; #s13 if ($err_code==0) { if ($CDS_exon_num>1) { $aux_c1++; if ($mRNA_num>0) { $j=0; foreach (@long_mRNA_stack) {$j++; if (($j==$mRNA_num) && ($mRNA_stack_errs[$j-1]==0)) {if ($mRNA_match_c>1) {$aux_rec=$SPL_note_rec."\n"} else {$aux_rec="\n"} $aux_str=$gene_rec_end.$_.$aux_rec.$mRNA_rec_end; push @CDS_blocks,$gene_block.$aux_str.$CDS_block; last} } if ($mRNA_match_c>1) {&trace("$mRNA_match_c matched mRNA found",4); &trace($gene_block,0); $mRNA_match_controvers_c++; $mRNA_assing_controvers_flg=1; } } else {&trace("no matched mRNA found",4); $aux_RNA_block=$CDS_block; $aux_RNA_block=~s/ CDS / mRNA /; $aux_RNA_block.=' /SPLnote="DUMMY mRNA_NF"'."\n"; $aux_str=$gene_rec_end.$aux_RNA_block.$mRNA_rec_end; push @CDS_blocks,$gene_block.$aux_str.$CDS_block; &trace("mRNA considered equ CDS",4); &trace($gene_block,0); $mRNA_assign_err_c++; } } else {&trace("intronless CDS",4); &trace($gene_block,0) } } else {&trace("CDS format error",4); &trace($gene_block,0); $CDS_format_error_c++;} #a19 # print "$CDS_block\n"; # &press_any_key; $CDS_block=''; } $c_CDS_num++; $CDS_flg=1; } elsif ($temp_g_b[$i] =~ /^\s{5}\w+\s{1}/){ $CDS_line_flg=0} if ($CDS_line_flg==1) {$CDS_block.="$temp_g_b[$i]\n"} $i++; }#SPL end while if ($CDS_flg==1) { # print '>>',$CDS_block,'<<',$eol_str; &press_any_key;#s13 ($mRNA_num,$CDS_exon_num,$mRNA_match_c,$err_code)= &check_mRNA_block_CDS_block_rel_gb149($CDS_block); # print '$mRNA_num=',$mRNA_num,' $CDS_exon_num=',$CDS_exon_num, # ' $mRNA_match_c=',$mRNA_match_c,' $err_code=',$err_code,$eol_str; # &press_any_key; #s13 if ($err_code==0) { if ($CDS_exon_num>1) { $aux_c1++; if ($mRNA_num>0) { $j=0; foreach (@long_mRNA_stack) {$j++; if (($j==$mRNA_num) && ($mRNA_stack_errs[$j-1]==0)) {if ($mRNA_match_c>1) {$aux_rec=$SPL_note_rec."\n"} else {$aux_rec="\n"} $aux_str=$gene_rec_end.$_.$aux_rec.$mRNA_rec_end; push @CDS_blocks,$gene_block.$aux_str.$CDS_block;last} } if ($mRNA_match_c>1) {&trace("$mRNA_match_c matched mRNA found",4); &trace($gene_block,0); $mRNA_match_controvers_c++; $mRNA_assing_controvers_flg=1; } } else {&trace("no matched mRNA found",4); $aux_RNA_block=$CDS_block; $aux_RNA_block=~s/ CDS / mRNA /; $aux_RNA_block.=' /SPLnote="DUMMY mRNA_NF"'."\n"; $aux_str=$gene_rec_end.$aux_RNA_block.$mRNA_rec_end; push @CDS_blocks,$gene_block.$aux_str.$CDS_block; &trace("mRNA considered equ CDS",4); &trace($gene_block,0); $mRNA_assign_err_c++; } } else {&trace("intronless CDS",4); &trace($gene_block,0) } } else {&trace("CDS format error",4); &trace($gene_block,0); $CDS_format_error_c++} } if ($mRNA_assing_controvers_flg) {$mRNA_match_controvers_gene_c++} if ($err_code!=0) {$aux_c3++} if (($err_code==0) and ($CDS_exon_num>1) and ($mRNA_num>0)) {$err_code=0} else {$err_code=1} #this line has no meaning return $err_code; } sub scan_entire_contig_vs_CDS{ my $op_res=0; my $aux_gene_name2='???'; my $mRNA_rec_end="_mRNA_REC_END_\n"; $CDS_block=~m/$mRNA_rec_end/; my $c_CDS_block=$'; my $c_start=0; my $c_end=0; my $c_err=0; my $c_start2=0; my $c_end2=0; my $c_err2=0; my $gene_counter_in_contig2=0; my $overlap=0; my $old_gene_num2=0; my $overlap_flg=0; # overlap of the gene with any other #a19 my $c_CDS2=0; # this c_CDS2 numerates on every CDS in second gene # while c_CDS indicates on every CDS except intronless # in first gene my $aux_CDS_block=&purge_feature_record($c_CDS_block); $aux_CDS_block =~s/[\s\n]//g; $aux_CDS_block=&expand_rec_1nuc($aux_CDS_block); # print "$aux_CDS_block\n";&press_any_key; ($c_start,$c_end,$c_err)=&get_feature_start_end2($aux_CDS_block); foreach $gene_block2 (@gene_blocks){ # print "gb2>>>\n"; # print $gene_block2,"\n"; # print "<<=0) { $c_CDS2=0; foreach $CDS_block2 (@CDS_blocks2){ $c_CDS2++; $CDS_block2=~m/$mRNA_rec_end/; my $c_CDS_block2=$'; my $aux_CDS_block2=&purge_feature_record($c_CDS_block2); $aux_CDS_block2 =~s/[\s\n]//g; $aux_CDS_block2=&expand_rec_1nuc($aux_CDS_block2); # print "$aux_CDS_block2\n";&press_any_key; ($c_start2,$c_end2,$c_err2)=&get_feature_start_end2($aux_CDS_block2); if (($c_err == 0) and ($c_err2 == 0)) {$overlap=&get_overlap($c_start,$c_end,$c_start2,$c_end2); # if ($overlap>0) {print "$aux_gene_name $aux_gene_name2 $overlap\n"; # &press_any_key;} if (($overlap>0) and ($gene_counter_in_contig !=$gene_counter_in_contig2)) {$locus_overlap_c++; $overlap_flg=1; #a19 if (($gene_counter_in_contig !=$old_gene_num) and ($gene_counter_in_contig2 !=$old_gene_num2)) {$locus_overlap_c_in_genes++; $old_gene_num=$gene_counter_in_contig; $old_gene_num2=$gene_counter_in_contig2; } # print "$aux_gene_name $aux_gene_name2 $overlap\n"; &trace("locus overlap: $aux_gene_name $aux_gene_name2 $overlap",4); # &press_any_key; } if (($overlap==0) and ($gene_counter_in_contig ==$gene_counter_in_contig2)) {if ($#CDS_blocks2>=0 ){ $CDS_non_overlap_event_c++; if ($gene_counter_in_contig2 !=$old_gene_num3) { $CDS_non_overlap_c++; $old_gene_num3=$gene_counter_in_contig2; } # print "$c_start _ $c_end vs $c_start2 _ $c_end2 "; # print "$c_CDS vs $c_CDS2 "; print "nonoverlapped CDS in $aux_gene_name2\n"; &trace("nonoverlapped CDS in $aux_gene_name2",4); } # end if ($#CDS_blocks2>=0 ) } # end if overlap==0 } #end end if c_err==0 # ++$cds_counter; # $CDS_c_str=$OK_gene_counter; #previous $gene_counter; } #end foreach $CDS_block2 } #end if ($#CDS_blocks2>=0) } #end foreach $gene_block2 if ($overlap_flg==1) {$locus_overlapped_c++} #a19 } #end sub sub get_overlap #(c_start,$c_end,$c_start2,$c_end2); ret overlap_len {my $c_ovarlap=0; my $auxl=0; $auxr=0; $c_start=$_[0];$c_end=$_[1];$c_start2=$_[2];$c_end2=$_[3]; $auxl=&max($c_start,$c_start2); $auxr=&min($c_end,$c_end2); if ($auxl<=$auxr) {$c_ovarlap=$auxr-$auxl+1} return $c_ovarlap; } sub get_overlap_exon #(rec1, rec2) ; ret 1 if overlap { my $a1; my $a2;my $err; my $b1; my $b2; my $c_overlap=0; ($a1,$a2,$err)= &get_feature_start_end2($_[0]); ($b1,$b2,$err)= &get_feature_start_end2($_[1]); $c_overlap=get_overlap($a1,$a2,$b1,$b2); if ($c_overlap>0) {$c_overlap=1}; return $c_overlap; } sub get_joint_exon #(rec1, rec2) ; ret 1 if overlap, joint exon rec { my $a1; my $a2;my $err; my $b1; my $b2; my $c_overlap=0; my $auxl=0; $auxr=0; my $joint_rec=''; ($a1,$a2,$err)= &get_feature_start_end2($_[0]); ($b1,$b2,$err)= &get_feature_start_end2($_[1]); $c_overlap=get_overlap($a1,$a2,$b1,$b2); if ($c_overlap>0) {$c_overlap=1; # print "$a1,$a2,$b1,$b2\n"; $auxl=&min($a1,$b1); $auxr=&max($a2,$b2); $joint_rec=$auxl."..".$auxr; # print "$joint_rec\n"; } #end if return $c_overlap,$joint_rec; } sub get_5or3_exon #(proj, member_of_class) ; ret 1 if 5', 2 if 3' { my $a1; my $a2;my $err; my $b1; my $b2; my $c_overlap=0; ($a1,$a2,$err)= &get_feature_start_end2($_[0]); ($b1,$b2,$err)= &get_feature_start_end2($_[1]); if ($a1==$b1) {$c_overlap=1}; if ($a2==$b2) {$c_overlap=2}; return $c_overlap; } sub num_diff_items #(@c_arr); ret number_of_different_elements {my @sorted=sort(@_); my $num_c=0; my $aux_flg=0; my $c_item=''; if ($#sorted<1) {$num_c=$#sorted+1} else { $c_item=$sorted[0];$num_c=1; foreach (@sorted) {if ($c_item ne $_) {$num_c++;$c_item=$_} } #end foreach item } #end else return $num_c; } sub test_overlap {print &get_overlap(1,3,2,4)," 2 \n"; } sub test_joint_rec {&get_joint_exon('1..3','2..3')," \n"; } sub test_diff_num {print &num_diff_items('1..3','1..3','1..3','1..3','1..3','2..3','2..3','uuu')," \n"; } sub min #(va1,val2) ret min { my $min; if ($_[0]<$_[1]) {$min=$_[0]} else {$min=$_[1]} return $min; } sub max #(va1,val2) ret max { my $max; if ($_[0]<$_[1]) {$max=$_[1]} else {$max=$_[0]} return $max; } sub check_CDS_gene_names #(c_locus_flg) { my $c_locus_flg=$_[0]; my $mRNA_rec_end="_mRNA_REC_END_\n"; my $c_CDS_gene_name='ZZZ'; $CDS_block=~m/$mRNA_rec_end/; my $aux_CDS_rec=$'; my $CDS_name_op_res=($aux_CDS_rec=~/\/(gene|locus_tag)=\"([\040-\176]+)\"/); if ($CDS_name_op_res) {$c_CDS_gene_name=$2} #EMBL_GB else {$CDS_name_op_res=($aux_CDS_rec=~/\/(gene|locus_tag)=\s*([\041-\176]+)/); if ($CDS_name_op_res) {$c_CDS_gene_name=$2} else {$c_CDS_gene_name='ZZZ'} } if ($aux_gene_name ne $c_CDS_gene_name) {$CDS_gene_name_mismatch_c++; $in_gene_CDS_gene_name_mismatch_c++; &trace("CDS gene name mismatch in $aux_gene_name",4); if ($c_locus_flg==0) { $CDS_gene_name_mismatch_locus_c++; $c_locus_flg=1; } } return $c_locus_flg; } sub check_mRNA_block_CDS_block_rel #($CDS_block) { #SPL new subroutine to mRNA_block for relation to specific CDS my $CDS_block=$_[0]; my $cds_flg=0; #SPL 1 = cds block available my $aux_CDS_block=''; my $err_code=0; my $mRNA_rec=''; my $match_found_flg=0; my $c_match_found_flg=0; my $longest_mRNA=0; my $longest_mRNA_ind=0; my $mRNA_st=0; my $mRNA_en=0; my $mRNA_match_c=0; my $c_pseudo_flg=0; #... # purge CDS feature # print "CDS_block>>>\n"; # print "$CDS_block"; # print "<<>>\n"; # print "$aux_CDS_block"; # print "<<]?\d+)/$1/g); $corr_CDS_exon_num++; if ($corr_CDS_exon_num!=$CDS_exon_num) { $CDS_exon_num_err_c++; if (($corr_CDS_exon_num==2) && ($CDS_exon_num==1)) { $CDS_exon_num_equ1_err_c++; if (defined($prot_id)) {$CDS_exon_num_equ21_err_defpid_c++;} else {$CDS_exon_num_equ21_err_nopid_c++;} } if (defined($prot_id)) {$CDS_exon_num_err_defpid_c++;} else {$CDS_exon_num_err_nopid_c++;} } if ($corr_CDS_exon_num!=$corr_CDS_exon_num2) { $aux_CDS_exon_num_diff_c++; } if ($CDS_exon_num==0) {$CDS_exon_num_equ0_c++} # print '$CDS_exon_num=', $CDS_exon_num,' ',$corr_CDS_exon_num, # ' ',$corr_CDS_exon_num2, "\n"; # &press_any_key; if ($CDS_exon_num==1) {$intronless_c++} if ($corr_CDS_exon_num==1) {$corr_intronless_c++; if (defined($prot_id)) {$corr_intronless_defpid_c++} } # print "corr_CDS_exon_num=$corr_CDS_exon_num\n"; if (($corr_CDS_exon_num==1) && ($c_pseudo_flg==1)) {$corr_intronless_CDS_pseudo_c++; if (defined($prot_id)) {$corr_intronless_CDS_pseudo_defpid_c++;} } if (($corr_CDS_exon_num>1) && ($c_pseudo_flg==1)) {$corr_spliced_CDS_pseudo_c++; if (defined($prot_id)) {$corr_spliced_CDS_pseudo_defpid_c++;} } if ($corr_CDS_exon_num==1) { #old :$CDS_exon_num $aux_CDS_block =~/(\d+)\.\.[<>]?(\d+)/; my $CDS_exon_st=$1; my $CDS_exon_en=$2; $i=0; $mRNA_match_c=0; $spliced_mRNA_avail_flg=0; foreach $mRNA_rec (@mRNA_stack) { $i++; $c_match_found_flg=0; $mRNA_rec =~s/[\s\n]//g; $mRNA_rec=&expand_rec_1nuc($mRNA_rec); while (my $c_res=($mRNA_rec=~/(\d+)\.\.[<>]?(\d+)/g)) {if (($1<=$CDS_exon_st) && ($2>=$CDS_exon_en)) {$c_match_found_flg=1;$mRNA_match_c++;last} } # print "$mRNA_rec $c_match_found_flg $CDS_exon_st $CDS_exon_en\n"; &press_any_key; if ($c_match_found_flg==1) {$match_found_flg=1} if ($c_match_found_flg==1) {($mRNA_st,$mRNA_en,$err_code)=&get_feature_start_end2($mRNA_rec); if ($longest_mRNA<($mRNA_en-$mRNA_st+1)) {$longest_mRNA=$mRNA_en-$mRNA_st+1; $longest_mRNA_ind=$i;} } if ($c_match_found_flg==1) { #02.12.04 my $mRNA_exon_num=($mRNA_rec=~s/,/,/g); #26.12.04 # print "mRNA_exon_num=$mRNA_exon_num\n"; if ($mRNA_exon_num>=1) {$alt_mRNA_while_CDS_intronless_c++; $spliced_mRNA_avail_flg=1; } #02.12.04 } #02.12.04 } #end for each if ($spliced_mRNA_avail_flg==1) {$spliced_mRNA_while_CDS_intronless_c1++; if (defined($prot_id)) {$spliced_mRNA_while_CDS_intronless_defpid_c1++} } } elsif ($corr_CDS_exon_num>=1) { my $CDS_st; my $CDS_end; ($CDS_st,$CDS_end,$err_code)=&get_feature_start_end2($aux_CDS_block); $aux_CDS_block =~/\d+\.\./; $aux_CDS_block='..'.$'; # print "aux_CDS_block>>>\n"; # print "$aux_CDS_block"; # print "<<]?\d+\)*$/\.\./; # print "aux_CDS_block>>>\n"; #s13 # print "$aux_CDS_block"; #s13 # print "<<>>',4); &trace($CDS_block,4); &trace('<<>>',4); # &trace($aux_CDS_block,4); # &trace('<<>>',4); &trace("@long_mRNA_stack",4); &trace('<<>>',4); &trace($CDS_block,4); &trace('<<>>',4); # &trace($aux_CDS_block,4); # &trace('<<>>\n"; # print "$CDS_block"; # print "<<>>\n"; # print "$aux_CDS_block"; # print "<<]?\d+)/$1/g); $corr_CDS_exon_num++; if ($corr_CDS_exon_num!=$CDS_exon_num) { $CDS_exon_num_err_c++; if (($corr_CDS_exon_num==2) && ($CDS_exon_num==1)) { $CDS_exon_num_equ1_err_c++; if (defined($prot_id)) {$CDS_exon_num_equ21_err_defpid_c++;} else {$CDS_exon_num_equ21_err_nopid_c++;} } if (defined($prot_id)) {$CDS_exon_num_err_defpid_c++;} else {$CDS_exon_num_err_nopid_c++;} } if ($corr_CDS_exon_num!=$corr_CDS_exon_num2) { $aux_CDS_exon_num_diff_c++; } if ($CDS_exon_num==0) {$CDS_exon_num_equ0_c++} # print '$CDS_exon_num=', $CDS_exon_num,' ',$corr_CDS_exon_num, # ' ',$corr_CDS_exon_num2, "\n"; # &press_any_key; if ($CDS_exon_num==1) {$intronless_c++} if ($corr_CDS_exon_num==1) {$corr_intronless_c++; if (defined($prot_id)) {$corr_intronless_defpid_c++} } # print "corr_CDS_exon_num=$corr_CDS_exon_num\n"; if (($corr_CDS_exon_num==1) && ($c_pseudo_flg==1)) {$corr_intronless_CDS_pseudo_c++; if (defined($prot_id)) {$corr_intronless_CDS_pseudo_defpid_c++;} } if (($corr_CDS_exon_num>1) && ($c_pseudo_flg==1)) {$corr_spliced_CDS_pseudo_c++; if (defined($prot_id)) {$corr_spliced_CDS_pseudo_defpid_c++;} } if ($corr_CDS_exon_num==1) { #old :$CDS_exon_num $aux_CDS_block =~/(\d+)\.\.[<>]?(\d+)/; my $CDS_exon_st=$1; my $CDS_exon_en=$2; $i=0; $mRNA_match_c=0; $spliced_mRNA_avail_flg=0; foreach $mRNA_rec (@mRNA_stack) { $i++; if ($mRNA_stack_errs[$i-1] ==0) { $c_match_found_flg=0; $mRNA_rec =~s/[\s\n]//g; $mRNA_rec=&expand_rec_1nuc2($mRNA_rec); $mRNA_rec=&rec_to_exon_order($mRNA_rec); while (my $c_res=($mRNA_rec=~/(\d+)\.\.[<>]?(\d+)/g)) {if (($1<=$CDS_exon_st) && ($2>=$CDS_exon_en)) {$c_match_found_flg=1;$mRNA_match_c++;last} } # print "$mRNA_rec $c_match_found_flg $CDS_exon_st $CDS_exon_en\n"; &press_any_key; if ($c_match_found_flg==1) {$match_found_flg=1} if ($c_match_found_flg==1) {($mRNA_len,$err_code)=&get_feature_start_end3($mRNA_rec); if ($modeCDS eq 'genome') { #getting pre-mRNA length (ver 1.36) ($mRNA_st,$mRNA_en,$err_code)=&get_feature_start_end2($mRNA_rec); $err_code=0; # dummy clear err_code; if ($mRNA_en>$mRNA_st) {$mRNA_len2=$mRNA_en-$mRNA_st+1} else {$mRNA_len2=$mRNA_st-$mRNA_en+1}; # &trace("mRNA_len2=$mRNA_len2",0); #deb 1.36 # print "mRNA_len2=$mRNA_len2",$eol_str; $mRNA_len=$mRNA_len2; } if ($longest_mRNA<$mRNA_len) {$longest_mRNA=$mRNA_len; $longest_mRNA_ind=$i;} } if ($c_match_found_flg==1) { #02.12.04 my $mRNA_exon_num=($mRNA_rec=~s/,/,/g); #26.12.04 # print "mRNA_exon_num=$mRNA_exon_num\n"; if ($mRNA_exon_num>=1) {$alt_mRNA_while_CDS_intronless_c++; $spliced_mRNA_avail_flg=1; } #02.12.04 } #02.12.04 }#end if OK mRNA_stack_err } #end for each if ($spliced_mRNA_avail_flg==1) {$spliced_mRNA_while_CDS_intronless_c1++; if (defined($prot_id)) {$spliced_mRNA_while_CDS_intronless_defpid_c1++} } } elsif ($corr_CDS_exon_num>=1) { my $CDS_st; my $CDS_end; my $CDS_len; # print "aux_CDS_block>>>\n"; # print "$aux_CDS_block"; # print "<<>>\n"; # print "$aux_CDS_block"; # print "<<]?\d+\)*$/\.\./; my $last_ddot_pos=rindex($aux_CDS_block,'..'); if ($last_ddot_pos) {$aux_CDS_block=substr($aux_CDS_block,0,$last_ddot_pos+2)} # print "aux_CDS_block>>>\n"; #s13 # print "$aux_CDS_block"; #s13 # print "<<$mRNA_st) {$mRNA_len2=$mRNA_en-$mRNA_st+1} else {$mRNA_len2=$mRNA_st-$mRNA_en+1}; # &trace("mRNA_len2=$mRNA_len2",0); #deb 1.36 # print "mRNA_len2=$mRNA_len2",$eol_str; $mRNA_len=$mRNA_len2; } if ($longest_mRNA<$mRNA_len) {$longest_mRNA=$mRNA_len; $longest_mRNA_ind=$i;} } #end if OK mRNA }#end if index } #end if OK mRNA_stack_errs } #end foreach #print '$mRNA_match_c=',$mRNA_match_c,'curr $err_code=',$err_code,$eol_str;&press_any_key; #s13 # temporary out further check # if ($mRNA_match_c>1) {# scan all OK1 mRNA # $mRNA_match_c2=0; # foreach $i (@stack_OK1_mRNA) { # $mRNA_rec=$mRNA_stack[$i-1]; # $mRNA_rec =~s/[\s\n]//g; # $mRNA_rec=&expand_rec_1nuc($mRNA_rec); # $mRNA_rec=&rec_to_exon_order($mRNA_rec); # print "mRNA_rec->>>\n"; # print "$mRNA_rec"; # print "<<<-mRNA_rec\n"; # &press_any_key; # print "aux_CDS_block->>>\n"; #s13 # print "$aux_CDS_block"; #s13 # print "<<<-aux_CDS_block\n"; #s13 # &press_any_key; #s13 # my $left_end_OK=0; # my $right_end_OK=0; ##old while (my $c_res=($mRNA_rec=~/(\d+)\.\.[<>]?(\d+)/g)) ##old {if (($1<=$CDS_st) && ($2>=$CDS_st)) {$left_end_OK=1;} ##old if (($1<=$CDS_end) && ($2>=$CDS_end)) {$right_end_OK=1;} #o27 ??? ##old } #end while exon ##new sub for left and right end # my @aux_CDS_items2=split ",",$aux_CDS_block; # pop(@aux_CDS_items2); my $aux_CDS_block2=join ",",@aux_CDS_items2; # $aux_CDS_block2.=","; # $mRNA_rec=~/$aux_CDS_block2/; # #it is impossible to use argument with single '(' within it # my $mRNA_left_aux=$`; # my $mRNA_right_aux=$'; # print "mRNA_right_aux>>>\n"; # print "$mRNA_right_aux"; # print "<<=$CDS_st) {$left_end_OK=1}} # else {$mRNA_left_aux=~/(\d+)$/; # $first_cod_exon=$1; if ($1<=$CDS_st) {$left_end_OK=1;}} # my @aux_CDS_items=split ",",$aux_CDS_block; # my $last_CDS_item=$aux_CDS_items[-1]; # if ($c_res=($mRNA_right_aux=~/complement\(\w+\:(\d+)\.\.$/)) # {$mRNA_right_aux=~/^(\d+)/; # $last_cod_exon=$1; if ($1<=$CDS_end) {$right_end_OK=1}} # else {$mRNA_right_aux=~/^(\d+)/; # $last_cod_exon=$1; if ($1>=$CDS_end) {$right_end_OK=1}} # # if (($left_end_OK==1) && ($right_end_OK==1)) # {$mRNA_match_c2++;push (@stack_OK2_mRNA,$i);} # } #end stack ## print "$mRNA_match_c $mRNA_match_c2\n"; ## &press_any_key; # if ($mRNA_match_c > $mRNA_match_c2) { # if ($mRNA_match_c2>=1) {$total_improvements_in_second_round++; # &trace("second round improvement in $aux_gene_name",4); # print "second round improvement in $aux_gene_name\n"; # } # if ($mRNA_match_c2==1) {$final_improvements_in_second_round++;} # if ($mRNA_match_c2==0) {$number_fails_in_second_round++; # &trace("second round failed in $aux_gene_name",0); # print "second round failed in $aux_gene_name\n"; # } # } # if (($mRNA_match_c > $mRNA_match_c2) && ($mRNA_match_c2>1)) # {# re-search longest mRNA # $longest_mRNA=0; # $longest_mRNA_ind=0; # $i=0; $mRNA_match_c=0; # foreach $i (@stack_OK2_mRNA) { # $mRNA_rec=$mRNA_stack[$i-1]; # $mRNA_rec =~s/[\s\n]//g; # $mRNA_rec=&expand_rec_1nuc($mRNA_rec); # $mRNA_rec=&rec_to_exon_order($mRNA_rec); # ($mRNA_len,$err_code)=&get_feature_start_end3($mRNA_rec); # if ($longest_mRNA<$mRNA_len) # {$longest_mRNA=$mRNA_len; # $longest_mRNA_ind=$i;} ## print ("CDS=$c_CDS_num mRNA= $i longest=$longest_mRNA ind=$longest_mRNA_ind"); ## &press_any_key; # } #end foreach # $mRNA_match_c=$mRNA_match_c2; # to adjust to return # } #end if less new matches # if ($mRNA_match_c2==1) # {$longest_mRNA_ind=$stack_OK2_mRNA[0]; # print " ind=$longest_mRNA_ind\n"; # $mRNA_match_c=1; # } # if ($mRNA_match_c2>1) # {# see about all mRNA equal # my $all_mRNA_equ_flg=1; # my $start_i=$stack_OK2_mRNA[0]; # my $start_mRNA_rec=$mRNA_stack[$start_i-1]; # $start_mRNA_rec =~s/[\s\n]//g; # $start_mRNA_rec=&expand_rec_1nuc($start_mRNA_rec); ## print "$start_mRNA_rec\n"; # foreach $i (@stack_OK2_mRNA) { ## print "start_i=$start_i i=$i\n"; # $mRNA_rec=$mRNA_stack[$i-1]; # $mRNA_rec =~s/[\s\n]//g; # $mRNA_rec=&expand_rec_1nuc($mRNA_rec); # $mRNA_rec=&rec_to_exon_order($mRNA_rec); ## print "$mRNA_rec\n"; # if ($start_mRNA_rec ne $mRNA_rec) {$all_mRNA_equ_flg=0;} ## $start_i=$i; #extra if all equ ## print ("CDS=$c_CDS_num mRNA= $i longest=$longest_mRNA ind=$longest_mRNA_ind"); ## &press_any_key; # } #end foreach ## &press_any_key; # if ($all_mRNA_equ_flg==1) {&trace("third round: mRNA equivalent",4); # print " third round: mRNA equivalent\n"; # $mRNA_match_c=1; # to adjust to return # $all_mRNA_equ_c++; # } #end if all mRNA equ # } #end if two or more matches # } # end if ($mRNA_match_c>1) # scan all OK1 mRNA } else {$err_code=1;$aux_c4++; };#!!! ##print 'internal $err_code=',$err_code,$eol_str;&press_any_key; if ($longest_mRNA_ind==0) { &trace('mRNA assign event:',4); &trace('$corr_CDS_exon_num='. $corr_CDS_exon_num,4); my $out_CDS_block=''; if ($CDS_block=~/ \/translation/) {$out_CDS_block=$`} else {$out_CDS_block=$CDS_block} if ($out_CDS_block=~/ \/db_xref/) {$out_CDS_block=$`} &trace('CDS_block>>>',4); &trace($out_CDS_block,0); &trace('<<>>',4); # &trace($aux_CDS_block,4); # &trace('<<>>',4); &trace("@long_mRNA_stack",0); &trace('<<>>',4); &trace($CDS_block,0); &trace('<<>>',4); # &trace($aux_CDS_block,4); # &trace('<<>>\n"; # print "$CDS_block"; # print "<<>>\n"; # print "$aux_CDS_block"; # print "<<]?\d+)/$1/g); # print '$CDS_exon_num=', $CDS_exon_num,"\n"; # &press_any_key; if ($CDS_exon_num==1) {$intronless_c++} elsif ($CDS_exon_num >1) {} else { &trace("gene=$aux_gene_name",0); &trace("aux_CDS_block>>>",0); &trace("$aux_CDS_block",0); &trace("<<>>',"\n"; ## print $short_rec; ## print '<<>>\n"; # print "$aux_CDS_block"; # print "<<]?\d+)/$1/g); my $i=0; while (my $c_res=($aux_short_feature=~/(\d+)\.\.[<>]?(\d+)/g)) {$i++; if ($i==1) {$c_start=$1} if ($i==$item_num) {$c_end=$2} } my $err_code=0; if (($c_start<1) || ($c_end<$c_start)) {$err_code=1;} return ($c_start,$c_end, $err_code) } sub get_feature_start_end2 #($short_feature); return (start, end, err_code) #SPL new subroutine: extracting feature start and end #SPL no cross-reference allowed (no check) #SPL assuming spaces and new lines signes already deleted {my $aux_short_feature=$_[0]; my $c_start=0; my $c_end=0; my $item_num=($aux_short_feature =~s/(\d+\.\.[<>]?\d+)/$1/g); # print "feature>>>\n"; # print "$aux_short_feature"; # print "<<]?(\d+)/g)) {$i++; if ($i==1) {$c_start=$1} if ($i==$item_num) {$c_end=$2} } my $err_code=0; if (($c_start<1) || ($c_end<$c_start)) {$err_code=1;} return ($c_start,$c_end, $err_code) } sub get_feature_start_end3 #($short_feature); return (exon_len_sum, err_code) #SPL new subroutine: calculating exon_len_sum #SPL cross-reference allowed (it is not obligatory to get mRNA_start and end #SPL since it is problematic if cross-reference is present) #SPL assuming spaces and new lines signes already deleted {my $aux_short_feature=$_[0]; my $c_start=0; my $c_end=0; my $c_len=0; my $err_code=0; my $item_num=($aux_short_feature =~s/(\d+\.\.[<>]?\d+)/$1/g); # print "feature>>>\n"; # print "$aux_short_feature"; # print "<<]?(\d+)/g)) {$i++; if (($1<1) || ($2<1)) {$err_code=1} #ORI SPL if (($1<1) || ($2<$1)) {$err_code=1;} if ($2<$1) {$c_len=$c_len+($1-$2+1)} else {$c_len=$c_len+($2-$1+1)}; } return ($c_len, $err_code) } sub get_feature_start_end4 #($short_feature); return (err_code) #SPL new subroutine: calculating exon_len_sum #SPL cross-reference allowed (it is not obligatory to get mRNA_start and end #SPL since it is problematic if cross-reference is present) #SPL not assuming spaces and new lines signes already deleted {my $aux_short_feature=$_[0]; my $c_start=0; my $c_end=0; my $c_ref=''; my $prev_start=0; my $prev_end=0; my $auxl=0; my $auxr=0; my $err_code=0; $aux_short_feature=~s/[\s\n]//g; my $c_res=($aux_short_feature=~m/^complement/); if ($c_res) {$c_res=($aux_short_feature=~s/^complement\(//); $c_res=($aux_short_feature=~s/\)$//); } #end if c_res $c_res=($aux_short_feature=~s/^join\(//); if ($c_res) {$c_res=($aux_short_feature=~s/\)$//)} my @c_items=split (/\,/,$aux_short_feature); my $item_num=($aux_short_feature =~s/(\d+\.\.[<>]?\d+)/$1/g); # print "feature>>>\n"; # print "$aux_short_feature"; # print "<<]?(\d+)/); if ($c_res) {$prev_start=$1;$prev_end=$2} else {$err_code=2} if (($prev_start<1) || ($prev_end<$prev_start)) {$err_code=1} $i++; while (($i<=$#c_items) && ($err_code==0)) { $c_res=($c_items[$i]=~/([A-Za-z0-9_.]+)\:/); if ($c_res) {$c_ref=$1} else {$c_ref=''} $c_res=($c_items[$i]=~/(\d+)\.\.[<>]?(\d+)/); if ($c_res) {$c_start=$1;$c_end=$2} else {$err_code=2} if ($err_code==0) { if (($c_start<1) || ($c_end<$c_start)) {$err_code=1;} # print "$prev_ref vs $c_ref"; &press_any_key; if ($c_ref eq $prev_ref) { $auxl=&max($prev_start,$c_start); $auxr=&min($prev_end,$c_end); if ($auxl<=$auxr) {$err_code=3} } #end same reference $prev_ref=$c_ref; $prev_start=$c_start; $prev_end=$c_end; } #end if err_code == 0 $i++; } #end while #print "mRNA err_code=$err_code";&press_any_key; return ($err_code) } sub check_cds{ ### at this point we have: $cdsline, $codon_start, $translation, $locusline, $defline my @exonsizes; my $j; my $exonsize; my $total_exon_size = 0; my @intronposition; my @intronnt; my @protein; my @introns; my @phase; my $intronno; my $exonno; # if(!$translation) { # return ""; # } # @protein = ("", split(//, $translation)); # print '>>>'; #SPL # &print_cds_line;print "\n"; #SPL # print length($cdsline),"\n"; #SPL $cdsline =~ s/[\w\.]+\://g; $cdsline =~ s/\s+//g; # &print_cds_line;print "\n"; #SPL @intr_pairs = split(/\,/,$cdsline); $intronno = $#intr_pairs; $exonno = $intronno + 1; foreach $intr_pair (@intr_pairs){ # print $intr_pair,$eol_str; #SPL exons anyway (@temp) = ($intr_pair =~ /(\d+)/g); # print "@temp,\n"; #SPL ($#temp == 0) && (push(@temp, $temp[0])); ($#temp > 1) && die "strange cdsline: /$intr_pair/:$cdsline\n\t$entrylines[1]\n"; push(@introns, @temp); } for($j = 0; $j < $#introns; $j += 2){ $exonsize = $introns[$j + 1] - $introns[$j] + 1; push(@exonsizes, $exonsize); $total_exon_size += $exonsize; } if($cdsline =~ /CDScomplement\(join/){ @exonsizes = reverse(@exonsizes); } $exonsizes[0] -= $codon_start; $total_exon_size -= $codon_start; # $diff = $total_exon_size / 3 - $#protein; $diff = 0; if($diff < -1 || $diff > 1){ return ""; } return "\n$cds_info\n"; } sub get_CDS_c_str #(c_num) ret new CDS_c_str; {my $c_num=$_[0]; my $aux_str=''; my $h26; my $l26; if ($c_num>0) { if ($c_num>26*27) {$aux_str='ZZ'} else {$h26=($c_num-1)/26; $l26=($c_num-1) % 26; if ($h26<1) {$aux_str=chr($l26+65)} else {$aux_str=chr($h26+64).chr($l26+65)} } } return $aux_str; } sub test_get_CDS_c_str {print "1 ",&get_CDS_c_str(1),"\n"; print "2 ",&get_CDS_c_str(2),"\n"; print "26*27-1 ",&get_CDS_c_str(26*27-1),"\n"; print "26*27 ",&get_CDS_c_str(26*27),"\n"; print "999 ",&get_CDS_c_str(999),"\n"; &press_any_key; return } #-------------- sub trace #(c_message, line_indent) {my $auxs; if ($_[1]>=0) {$auxs=' 'x$_[1];} print TRACEF $auxs,$_[0],"\n"; } #-------------- sub trace_stat #(c_message, line_indent) {my $auxs; if ($_[1]>=0) {$auxs=' 'x$_[1];} print STATF $auxs,$_[0],"\n"; } #-------------- sub test_expand_1nucl {my $CDS_rec='join(20,23..24)';$CDS_rec=&expand_rec_1nuc2($CDS_rec); print "$CDS_rec\n";&press_any_key; $CDS_rec='join(20,23)';$CDS_rec=&expand_rec_1nuc2($CDS_rec); print "$CDS_rec\n";&press_any_key; $CDS_rec='join(complement(20),23))';$CDS_rec=&expand_rec_1nuc2($CDS_rec); print "$CDS_rec\n";&press_any_key; $CDS_rec='join(L2:20,23)';$CDS_rec=&expand_rec_1nuc2($CDS_rec); print "$CDS_rec\n";&press_any_key; $CDS_rec='join(complement(L2:20),23)';$CDS_rec=&expand_rec_1nuc2($CDS_rec); print "$CDS_rec\n";&press_any_key; $CDS_rec='join(20,complement(23))';$CDS_rec=&expand_rec_1nuc2($CDS_rec); print "$CDS_rec\n";&press_any_key; $CDS_rec='join(<11,complement(L2:23))';$CDS_rec=&expand_rec_1nuc2($CDS_rec); print "$CDS_rec\n";&press_any_key; $CDS_rec='join(<1,complement(L2:23..>24))';$CDS_rec=&expand_rec_1nuc2($CDS_rec); print "$CDS_rec\n";&press_any_key; $CDS_rec='join(complement(L2:1), 31)';$CDS_rec=&expand_rec_1nuc2($CDS_rec); print "$CDS_rec\n";&press_any_key; $CDS_rec='join(complement(L2:<1),complement(L2:23))';$CDS_rec=&expand_rec_1nuc2($CDS_rec); print "$CDS_rec\n";&press_any_key; $CDS_rec='join(complement(L2:<1..10),complement(L2:23))';$CDS_rec=&expand_rec_1nuc2($CDS_rec); print "$CDS_rec\n";&press_any_key; $CDS_rec='join(complement(L2:<1),complement(L2:>23))';$CDS_rec=&expand_rec_1nuc2($CDS_rec); print "$CDS_rec\n";&press_any_key; $CDS_rec='join(complement(L2:<1..10),complement(L2:23..>31))';$CDS_rec=&expand_rec_1nuc2($CDS_rec); print "$CDS_rec\n";&press_any_key; $CDS_rec='complement(join((L2:<1..10),(L2:23..>31)))';$CDS_rec=&expand_rec_1nuc2($CDS_rec); print "$CDS_rec\n";&press_any_key; $CDS_rec='complement(join((L2:<10),(L2:>31)))';$CDS_rec=&expand_rec_1nuc2($CDS_rec); print "$CDS_rec\n";&press_any_key; } #-------------- sub expand_rec_1nuc #(c_rec) # example: join(1..20,55) -> join(1..20,55..55) { my $c_rec=$_[0]; my $c_res=($c_rec=~s/([^\0560-9]{1})(\d+),/$1$2\.\.$2,/g); $c_rec=~s/,(\d+)([^\0560-9]{1})/,$1\.\.$1$2/g; #\056=period (point) # ignore the case of single nucleotide only return $c_rec; } #-------------- sub expand_rec_1nuc2 #(c_rec) # example: join(1..20,55) -> join(1..20,55..55) { my $c_rec=$_[0]; my $c_res=($c_rec=~s/([^\0560-9<>]{1})(\d+),/$1$2\.\.$2,/g); $c_res=($c_rec=~s/([^\0560-9<>]{1})<(\d+)(\)?),/$1<$2\.\.$2$3,/g); #jan05 # $c_res=($c_rec=~s/\((<{1})(\d+),/\($1$2\.\.$2,/g); #jan05 $c_rec=~s/,(\d+)([^\0560-9]{1})/,$1\.\.$1$2/g; #\056=period (point) # ignore the case of single nucleotide only $c_res=($c_rec=~s/([^\0560-9<>]{1})(\d+)\)/$1$2\.\.$2\)/g); $c_res=($c_rec=~s/([^\0560-9<>]{1})>(\d+)\)/$1$2\.\.>$2\)/g); # $c_rec=~s/,(\d+)([^\0560-9]{1})/,$1\.\.$1$2/g; #\056=period (point) return $c_rec; } #-------------- sub test_rec_to_exon_order {my $CDS_rec='complement(AL:1..2)'; $CDS_rec=&rec_to_exon_order($CDS_rec); print "$CDS_rec\n";&press_any_key; } #-------------- sub rec_to_exon_order #(c_rec) # example: complement(join(1..20,55..56)) -> # join(c(55..56),c(1..20)); #ASSUME purge_feature_record passed #ASSUME single use of complement and join operators #ASSUME valid syntax { my @c_f; my $c_ref=''; my $c_rec=$_[0]; my $c_res=($c_rec=~m/^complement/); if ($c_res) {$c_res=($c_rec=~s/^complement\(//); $c_res=($c_rec=~s/\)$//); $c_res=($c_rec=~s/^join\(//); if ($c_res) {$c_res=($c_rec=~s/\)$//)} @c_f=split /,/,$c_rec; @c_f=reverse @c_f; foreach $exon_rec (@c_f) { $c_res=($exon_rec=~/(\d+)\.\.[<>]?(\d+)/); #$exon_rec=$`.$2."\.\.".$1.$'; $exon_rec=$2."\.\.".$1; #v. 1.53e } $c_rec=join ",", @c_f; } elsif ($c_res=($c_rec=~m/^join/)) { $c_res=($c_rec=~s/^join\(//); $c_res=($c_rec=~s/\)$//); @c_f=split /,/,$c_rec; foreach $exon_rec (@c_f) { $c_ref=''; if ($exon_rec=~m/^complement/) { $exon_rec=~s/^complement\(//; if ($exon_rec=~m/\:/) {$c_ref=$`.':';$exon_rec=$';} $c_res=($exon_rec=~/(\d+)\.\.[<>]?(\d+)/); $exon_rec=$c_ref.$2."\.\.".$1; } } $c_rec=join ",", @c_f; } return $c_rec; } #-------------- sub print_by_page #(block_of_lines) {my $p_len=10; my $c_c=0; my $cp_c=1; my @block_of_lines=split ("\n",$_[0]); while ($c_c<=$#block_of_lines) {print $block_of_lines[$c_c],"\n"; $c_c++;$cp_c++; if ($cp_c==$p_len) {$cp_c=1;&press_any_key;} } } #-------------- small lib ---- #-------------- sub print_cds_line #(c_line) {my $c_line=$_[0]; for (my $j=0;$j<=(length($c_line));$j++) {if (ord(substr($c_line,$j,1))!=13) {print substr($c_line,$j,1)}} } #-------------- sub chop_if_chr #(string;chr) {my $loc_string=$_[0]; #print 'char=',ord($_[1]),' string=',$loc_string,' substr=',ord(substr ($loc_string,-1,1)); if (substr ($loc_string,-1,1) eq $_[1]) {chop($loc_string);} #print 'after last=',ord(substr ($loc_string,-1,1)); return $loc_string; } #-------------- sub del_trail_spaces #($loc_l) {$_=$_[0]; s/ +$//;$_=$_} #deleting trailing spaces #-------------- sub purge_eol #($loc_l) {my $loc_l=$_[0]; print 'i: ',$loc_l,' ',length($loc_l); chomp($loc_l); print 'chomp: ',$loc_l,' ',length($loc_l); $loc_l=&chop_if_chr($loc_l,chr(13)); #print 'chopic: ',$loc_l,' ',length($loc_l); } #-------------- sub purge_eol_sp #($loc_l) {my $loc_l=$_[0]; chomp($loc_l); $loc_l=&chop_if_chr($loc_l,chr(13)); $loc_l=&del_trail_spaces($loc_l); } #-------------- sub press_any_key { read STDIN,my $a_key,1; $a_key='';} #---- end of lib ----- #------- REVISION HISTORY -------------------------- # ver 1.03 20.10.04 tracing, debug in intronless CDS # ver 1.04 21.10.04 minor debug # ver 1.05 22.10.04 more statistics # ver 1.06 01.11.04 correction for CDS exon length=1 # represented by single nucleotide pos # ver 1.07 07.11.04 flow chart correction # error statistics correction # ver 1.08 11.11.04 special subroutines for handling # all CDS without wings # ver 1.09 17.11.04 store entries with stop codons, # statistics for those events, # scan for CDS overlap around contig # ver 1.10 19.11.04 scan for CDS nonoverlap around gene, # scan for CDS gene name contradiction, # dismiss alt splicing after CDS gene name test # ver 1.11 19.11.04 debug CDS nonoverlap around gene # debug CDS gene name mismatch and statistics # ver 1.12 02.12.04 debug check_mRNA_block_CDS_block_rel # (second and third round) # SPL_note="mRNA assignment controversial" # ver 1.13 11.12.04 mark CDS with not founf mRNA as # SPLnote="DUMMY mRNA NF" # ver 1.14 16.12.04 additional statistics for intronless CDS # ver 1.15 17.12.04 additional statistics for no protein_id events # statistics for correct CDS_exon_num, # corrected CDS_exon_num # ver 1.16 17.12.04 debug mRNA_exon_num # ver 1.17 15.01.05 no trace intronless_c since # corr_intronless_c is valid # ver 1.18 19.08.05 statistics file introduced # minor bugs fixed # ver 1.19 04.09.05 correction type misprint effected # to lost lines in first part of statistics # in tEID output # ver 1.20 11.09.05 GB149 # ver 1.21 13.09.05 reference (nothing special) # ver 1.22 15.09.05 PSEUDO # ver 1.23 18.09.05 "locus_tag=" handling # ver 1.24 25.10.05 GB149 mRNA assignment # ver 1.25 01.11.05 --- " --- # ver 1.26 08.11.05 --- " --- ----------- new version # ver 1.27 18.11.05 --- " --- ----------- debug in first end last # CDS exon # ver 1.28 25.12.05 GB149 out contig_scan # - " - out name mismatch # no output /translation in CDS block in .tEID # ver 1.29 26.12.05 debug rec_to_exon_order # ver 1.30 28.12.05 debug get_gene_block # ver 1.31 01.01.06 new version of expand_rec_1nuc2, # mRNA_stack_errs, # corrected check_mRNA_block_CDS_block_rel_gb149, # new get_feature_start_end4: getting format error # ver 1.32 02.01.06 statistics of rejected mRNA records # no output from /db_xref till end of CDS block # in .tEID # corrected get_feature_start_end4 # ver 1.33 05.01.06 debug expand_rec_1nuc2 # ver 1.34 29.06.06 fixing errors debugged in GB149 version # ver 1.35 11.07.06 CDS and mRNA block: no leading 4 spaces # ver 1.36 19.07.06 returning to longest pre-mRNA in GB149 subroutines # under genome global selector # Note. GB149 always longest mRNA as exon sum # Since it is impossble to calculate pre-mRNA # length in the case external sequence reference # ver 1.37 23.08.06 fixing version after all the tests made # ver 1.37e 01.09.06 alternative exon count:starting # ver 1.38e 01.09.06 first working # ver 1.39e 01.09.06 another definition of alternative exons # ver 1.40e 13.09.06 skipped, truncated and other exons, started. # ver 1.41e 21.09.06 skipped debugged, get_exon_overlap debugged # ver 1.42e 21.09.06 exon projection # ver 1.43e 21.09.06 truncated and other exons, finished, skipped temp out # ver 1.44e 25.09.06 bug in truncated and other exons # ver 1.45e 25.09.06 truncated and other exons debug # ver 1.46e 07.10.06 retained intron among truncated exon and others classes # ver 1.47e 18.10.06 (1) retained intron among others classes prototype # (in class number) # (2) retained intron among others classes prototype # (in intron number) # ver 1.48e 19.10.06 marginal skipped exon number prototype # ver 1.49e 20.10.06 debug previous ver, print individual stat, # mRNA_ind1++ position correction # ver 1.50e 20.10.06 debug previous ver, print total stat. # ver 1.51e 20.10.06 brushing, out debug prints, total stat in stat file # ver 1.52e 20.10.06 minor modification # ver 1.53e 08.04.07 modified [<>] signs handling in rec_to_exon_order: # [<>] are omitted but mentioned in statistics as # CDS_incomplete; # modified format of exon statistics # ver 1.54e 18.07.07 corrected mRNA word in stat files # ver 1.55e 26.07.07 damping invalid format in EMBL formatted to GenBank # in line /gene=gene_name # in gene record, that was made without double quotation # which leads to empty gene_name # mark is EMBL_GB # ver 1.56e 05.08.07 rewritten proc produce projection in exon count block # since previous version has a bug # #------- KNOWN PROBLEMS ---------------------------- # 1. exon feature is not used # 2. Coordinate of the kind one-of-the-interval is not handled # and considered as a format error, example: (23.25)..35 #---------------------------------------------------