#!/usr/local/perl
#program created by Alexei Fedorov in Nov 2014. Its goal to calculate locations of shared vrGVs for two individuals
#command line for starting the program:  perl 2individualsGenomeDif_vrGVs.pl Arg1 Arg2 Arg3
#Arg1 is a fragment of the output filename; Arg2 is identifier of individual #1 from the 1092 sequenced genomes;  Arg3 is the indentifier of the second individual
#example of  the command line:  nohup perl GenomeDiff_2ind_vrGV_v3.pl CHS_CHS NA18548 NA18567 &
#the output file for this example will have the name: Table5S_CHS_CHS_NA18548_NA18567 

$c=0; @b =();
open(OUT, ">TableS5_$ARGV[0]_$ARGV[1]_$ARGV[2]");
open(IN, "1000genomes_ID");
while(<IN>) {
	@b=split(/\t/, $_); 
}
print "$#b \n";
@pair_id = ($ARGV[1], $ARGV[2]);
@columns2 =();
for $x (0..$#b) {
        for $y (0..$#pair_id) {
                if ($b[$x] eq $pair_id[$y]) {push(@columns2, $x); print "$x \t $b[$x] \t $#columns2 \n";}
        }
}
print OUT "columns2 $columns2[0] \t $columns2[1] \n";

$match =0;
#open (FILE, "zcat /home/asahama/ALL.chr12.phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz |") || die "could not open file \n";
for $L (1..22) {
	$name = '/home/afedorov/rareSNP_chr'.$L.'.gz';
	open (FILE, "zcat $name |") || die "could not open file \n";
	$line_N =0;
	#$w, $z two individuals from the analyzed population(e.g. GBR)
	while (<FILE>){
		$line_N++;  
        	if ($line_N > 30) {
			$count = () = $_ =~ /\|1/g;
			$count2 = () = $_ =~ /1\|/g;
			if ($count + $count2 >4 ){next;}
				@e=split(/\t/, $_);
				if ($e[$columns2[0]] =~/^(\d)\|(\d)/){$p1 = $1 + $2;}
				else {print "problem with the entry $e[$columns2[0]] \n";}
				if ($e[$columns2[1]] =~/^(\d)\|(\d)/){$p2 = $1 + $2;}
       				else {print "problem with the entry $e[$columns2[1]] \n";}
				if ($p1 >0 && $p2 >0) {print OUT "CHR$e[0]\t$e[1]\t$e[2]\t$e[$columns2[0]]\t$e[$columns2[1]]\t$e[3]\t$e[4]\n"; $match++; }
		}
	}
}
print OUT "number of matches of vrGVs is $match \n";