#!/usr/local/perl #updated 6/23/2014 #This program was created to calculate the total number of genomic variatns differences between each pair of individuals from the same population. #$chunk =$ARGV[3]*100000 +1; # The genomes length of each individual is about 3 billion line.The next step is to process teh genomes of each pair of individuals in parts(chunks) and these chunks will be processed in parallel(at the same time) to reduce the rpocessing time. "bpg-n server is able to process about 22 programs at the same time with 100% speed capacity" $chunk = $ARGV[1]*1000000 +1; #$chunk will be 1 million line for each chunk. $dir = 'JULY';# $dir is the directory that will be used to save all of the results. #print "$chunk \n"; mkdir "./$dir", 0750 unless -d "./$dir"; open(OUT1, ">./$dir/$ARGV[0]_$ARGV[1]_A")|| die "Can't open 1_1 output : $!\n"; # open(OUT2, ">./$dir/$ARGV[0]_$ARGV[2]_$ARGV[3]_B")|| die "Can't open 2_2 output : $!\n"; #$str = localtime(); #print OUT1 "$str\n"; #@pair_id = (); open (FILE1, "TSI_Genome_HET.txt") || die "Can't open zipped file chr22 : $!\n";#TSI_Genome_HET.txt is the .txt table that has all autosomes data of individuals belongs to TSI population. $line_N = 0; while (){ $line_N++; if ($line_N == 1){ chop $_; @ID = split(/\t/, $_);#@ID will contain a list of identifiers extracted from the desired population table #$n = 1; #foreach (@ID) { #print "$n\t$_\n";$n++; #} last; } } #new edit to test for two individuals. #@pair_id = ('NA19328', 'NA19312');#TWo GBR individual to test for $x (0..$#ID) { for $y (0..$#ID) { if ($ID[$x] eq $ID[$y]) { push(@columns, $x); # #end of the edit. } } } #@columns will contain the columns number of each individua identifier. for $n(0..$#columns){ #print $columns[$n]."\n"; } #The next step is to open the population genomes file of interest. open (FILE, "TSI_Genome_HET.txt") || die "Can't open zipped file chr21 : $!\n"; $line_N = 0; #$w, $z two individuals from the analyzed population(e.g. GBR) @diff = (); while (){ $line_N++; if ($line_N >= $chunk + 1000000) {last;}# this step is to define the number of the chunk to process. if ($line_N > 1 && $line_N >= $chunk) { # print $line_N "\n"; # print $line_N."\n";_ # if ($line_N > 1){ # print "my god\n"; @e = split(/\t/, $_); # print $e[1]."\n"; for $w (0..$#columns){ for $z (0..$#columns){ if ($w < $z){ # @e = split(/\t/, $_); if ($e[$columns[$w]] =~/^(\d)\|(\d)\:(.+)\:(.+)\,(.+)\,(.+)/){ $p1 = $1 + $2;#$1=parent1 Allele , $2=Parent2 Allele. # print $p1."\n"; } if ($e[$columns[$z]] =~/^(\d)\|(\d)\:(.+)\:(.+)\,(.+)\,(.+)/){ $p2 = $1 + $2; # print $p2."\n"; } $diff[$w][$z] += abs ($p1 - $p2);#diff is the total number of the genomic differences between each pair of individuals(individual1 =$z, Individual2=$w). # $my = abs ($p1 - $p2); # } # print $my."\n"; } } } } } for $w (0..$#columns) { for $z (0..$#columns) { if ($w < $z) { print OUT1 "$columns[$w]\t$ID[$columns[$w]]\t$columns[$z]\t$ID[$columns[$z]]\t$diff[$w][$z]\n"; } } } #$str2 = localtime(); #print OUT1 "$str2\n";