#!/usr/bin/perl # Serge Saxonov, Iraj Daizadeh, Alexei Fedorov and Walter Gilbert # Department of Molecular and Cellular Biology # Harvard University # Cambridge, MA 02138 # Reference: Serge Saxonov, Iraj Daizadeh, Alexei Fedorov, and Walter Gilbert # Nucleic Acids Res. 2000 Jan 1;28(1):185-190 #SPL spl notes #SPL spl reduction in Sept 2004 #SPL ver 1.02 21.10.04 #SPL ver 1.03 22.10.04 #SPL ver 1.04 26.11.04 #SPL ver 1.05 19.08.05 #SPL ver 1.06 12.09.05 ### first_parse.pl -- This is the first parser of GenBank files. It simply ### pulls out all headers with the CDS join feature. ### It needs the file "seqfiles.list", which contains ### the names of all the .seq files. my $txt_file_style=1;# =0 DOS-like style; =1 UNIX-like style #SPL my $eol_str; #SPL if ($txt_file_style==0) {$eol_str="\r\n";} else {$eol_str="\n";} #SPL my $aux_name='gun_ZZ__'; #SPL aux file to store gunzipped sq file #print "checkpoint2 \n"; ($#ARGV != -1) || die "Usage: first_parse.pl prefix\n"; $prefix = $ARGV[0]; # print $prefix, "\n"; open(SEQFILES, "SEQFILES.LIS") || die "Could not find the seq files list\n"; #SPL ORI open(SEQFILES, "seqfiles.list") || die "Could not find the seq files list\n"; open(OUT, ">${prefix}.RAW") || die "Could not create ${prefix}.RAW\n"; open(TRACEF, ">>${prefix}.tEID") || die; #SPL trace file already in use #open(STATF, ">>${prefix}.sEID") || die; # a19 while($file = ){ $file =~ s/\s+//g; my @c_name=split(/\./,$file); #SPL # print "@c_name\n";&press_any_key; if (substr($file,0,1) ne '#') { #SPL if (uc($c_name[-1]) eq 'GZ') { $aux_name='gun_ZZ__'; &trace("gunzip -c $file > $aux_name",0); print ("gunzip -c $file > $aux_name\n"); system("gunzip -c $file > $aux_name"); $file=$aux_name; } open(IN, $file) || die "Could not open $file\n"; &trace("snoopping ".$file,0); print "snoopping $file\n"; #SPL while($line = ){ if ($line eq $eol_str) {$line = ' '.$eol_str} #s12 if($line =~ /^LOCUS/){ $entry = $line; while($line = ){ if ($line eq $eol_str) {$line = ' '.$eol_str} #s12 $entry .= $line; if($line =~ /^ CDS join\(|^ CDS complement\(join\(/){ &scan_until_base; $id++; print OUT "\$ID $id\n"; print OUT "$entry\n\n\n"; last; } elsif($line =~ /^ORIGIN/){ last; } } } } close IN; } #end if not comment file SPL } close TRACEF; #SPL #close STATF; #a19 exit; #SPL sub scan_until_base{ while($line = ){ if ($line eq $eol_str) {$line = ' '.$eol_str} #s12 if($line =~ /^ORIGIN/){ last; } else{ $entry .= $line; } } } exit; #SPL #-------------- sub trace #(c_message, line_indent) {my $auxs; if ($_[1]>=0) {$auxs=' 'x$_[1];} print TRACEF $auxs,$_[0],"\n"; } #-------------- small lib ---- #-------------- sub chop_if_chr #(string;chr) {my $loc_string=$_[0]; #print 'char=',ord($_[1]),' string=',$loc_string,' substr=',ord(substr ($loc_string,-1,1)); if (substr ($loc_string,-1,1) eq $_[1]) {chop($loc_string);} #print 'after last=',ord(substr ($loc_string,-1,1)); return $loc_string; } #-------------- sub del_trail_spaces #($loc_l) {$_=$_[0]; s/ +$//;$_=$_} #deleting trailing spaces #-------------- sub purge_eol #($loc_l) {my $loc_l=$_[0]; print 'i: ',$loc_l,' ',length($loc_l); chomp($loc_l); print 'chomp: ',$loc_l,' ',length($loc_l); $loc_l=&chop_if_chr($loc_l,chr(13)); #print 'chopic: ',$loc_l,' ',length($loc_l); } #-------------- sub purge_eol_sp #($loc_l) {my $loc_l=$_[0]; chomp($loc_l); $loc_l=&chop_if_chr($loc_l,chr(13)); $loc_l=&del_trail_spaces($loc_l); } #-------------- sub press_any_key { read STDIN,my $a_key,1; $a_key='';} #-------------- sub get_hour_min_sec_time_elapsed_line($c_time) {my $t_2=time-$t_1; my $out_str; return $out_str=sprintf ("%6d h %2d m %2d s", $t_2 / 3600, ($t_2 % 3600)/60,$t_2 % 60); } #---- end of lib ----- #---- KNOWN PROBLEMS ----- # If producing empty prefix.RAW file further programs fail. # #-------- REVISION HISTORY ----- # ver 1.02 tracing # ver 1.03 remark in SEQ.LIST file # ver 1.04 26.11.04 reading gzipped SQ files b35 like hs_ref_chrx.gbk.gz # ver 1.05 19.08.05 statistics file introduced # ver 1.06 12.09.05 GB149 empty line treating #-------------------------------