#!/usr/bin/perl   
# Serge Saxonov, Iraj Daizadeh, Alexei Fedorov and Walter Gilbert
# Department of Molecular and Cellular Biology
# Harvard University
# Cambridge, MA 02138
# Reference: Serge Saxonov, Iraj Daizadeh, Alexei Fedorov, and Walter Gilbert
#       Nucleic Acids Res. 2000 Jan 1;28(1):185-190
#SPL spl notes 
#SPL spl reduction in Sept 2004
#SPL ver 1.02 21.10.04
#SPL ver 1.03 22.10.04
#SPL ver 1.04 26.11.04
#SPL ver 1.05 19.08.05
#SPL ver 1.06 12.09.05


### first_parse.pl --    This is the first parser of GenBank files. It simply 
###                  pulls out all headers with the CDS join feature.
###                  It needs the file "SEQFILES.LIS" (originally named
###                  "seqfiles.list"), which contains the names of all
###                  the .seq files.
my $txt_file_style=1;# =0 DOS-like style; =1 UNIX-like style     #SPL
my $eol_str;                                                     #SPL 
if ($txt_file_style==0) {$eol_str="\r\n";} else {$eol_str="\n";} #SPL

my $aux_name='gun_ZZ__'; #SPL aux file to store gunzipped sq file
#print "checkpoint2 \n";

($#ARGV != -1) || die "Usage: first_parse.pl prefix\n";
$prefix = $ARGV[0];
# print $prefix, "\n";
open(SEQFILES, "SEQFILES.LIS") || die "Could not find the seq files list\n";
#SPL ORI open(SEQFILES, "seqfiles.list") || die "Could not find the seq files list\n";
open(OUT, ">${prefix}.RAW") || die "Could not create ${prefix}.RAW\n";
open(TRACEF, ">>${prefix}.tEID") || die; #SPL trace file already in use
#open(STATF, ">>${prefix}.sEID") || die; # a19 

while($file = <SEQFILES>){
   $file =~ s/\s+//g;
   my @c_name=split(/\./,$file);  #SPL
#   print "@c_name\n";&press_any_key;
   if (substr($file,0,1) ne '#') { #SPL
      if (uc($c_name[-1]) eq 'GZ') {
         $aux_name='gun_ZZ__';
         &trace("gunzip -c $file > $aux_name",0);
         print ("gunzip -c $file > $aux_name\n");
         system("gunzip -c $file > $aux_name");
         $file=$aux_name;
      }
   open(IN, $file) || die "Could not open $file\n";
   &trace("snoopping ".$file,0);   print "snoopping $file\n"; #SPL
   while($line = <IN>){
      if ($line eq $eol_str) {$line = ' '.$eol_str}  #s12 
      if($line =~ /^LOCUS/){                                                     
         $entry = $line;
         while($line = <IN>){
      if ($line eq $eol_str) {$line = ' '.$eol_str}  #s12 
            $entry .= $line;
            if($line =~ /^     CDS             join\(|^     CDS             complement\(join\(/){
               &scan_until_base;
               $id++;
               print OUT "\$ID $id\n";
               print OUT "$entry\n\n\n";
               last;
            }   
            elsif($line =~ /^ORIGIN/){
               last;
            }
         }
      }
   }
   close IN;
   } #end if not comment file  SPL
}
close TRACEF; #SPL
#close STATF; #a19
exit; #SPL

sub scan_until_base{
   while($line = <IN>){
      if ($line eq $eol_str) {$line = ' '.$eol_str}  #s12 
      if($line =~ /^ORIGIN/){
         last;
      }
      else{
         $entry .= $line;
      }
   }
}
exit; #SPL  never reached at run time: the main script body ends with its own exit before the sub definitions
#--------------
sub trace #(c_message, line_indent)
    {my $auxs;
     if ($_[1]>=0) {$auxs=' 'x$_[1];}
     print TRACEF $auxs,$_[0],"\n";
    }
#-------------- small lib ---- 
#--------------

sub chop_if_chr   #(string;chr)
    {my $loc_string=$_[0];
     #print 'char=',ord($_[1]),' string=',$loc_string,' substr=',ord(substr ($loc_string,-1,1));
     if (substr ($loc_string,-1,1) eq $_[1]) 
        {chop($loc_string);} 
     #print 'after last=',ord(substr ($loc_string,-1,1));

     return $loc_string;
    }
#--------------
sub del_trail_spaces #($loc_l)
    {$_=$_[0]; s/ +$//;$_=$_}  #deleting trailing spaces
#--------------
sub purge_eol #($loc_l)
    {my $loc_l=$_[0];
     print 'i: ',$loc_l,'   ',length($loc_l);
     chomp($loc_l); 
     print 'chomp: ',$loc_l,'   ',length($loc_l);
     $loc_l=&chop_if_chr($loc_l,chr(13));
     #print 'chopic: ',$loc_l,'   ',length($loc_l);
    }
#--------------
sub purge_eol_sp #($loc_l)
    {my $loc_l=$_[0];
     chomp($loc_l); $loc_l=&chop_if_chr($loc_l,chr(13));
     $loc_l=&del_trail_spaces($loc_l);
    }
#--------------
sub press_any_key
    { read STDIN,my $a_key,1; $a_key='';}
#--------------
sub get_hour_min_sec_time_elapsed_line($c_time)
    {my $t_2=time-$t_1;
     my $out_str;
     return $out_str=sprintf ("%6d h %2d m %2d s", 
                           $t_2 / 3600, 
                          ($t_2 % 3600)/60,$t_2 % 60);
    }
#---- end of lib -----
#---- KNOWN PROBLEMS -----
# If this script produces an empty prefix.RAW file, the downstream programs fail.
# 
#-------- REVISION HISTORY -----
#  ver 1.02 tracing
#  ver 1.03 remark in SEQ.LIST file
#  ver 1.04 26.11.04  reading gzipped SQ files b35 like hs_ref_chrx.gbk.gz
#  ver 1.05 19.08.05  statistics file introduced
#  ver 1.06 12.09.05  GB149 empty line treating
#-------------------------------