#!/pkg/bin/perl -w # June 9, 2006 Convert to bbhmisstest.pl incorporating the idea that Dan Brown suggested # bbhmisstesthk.pl # July 20, 2005 # Modified to solve produce the ILP whose solution solves # the haplotyping problem so that the HK bound for the resulting haplotypes is # minimized over all possible haplotyping solutions. # # June 4, 2005 changed use of stringlen, right and left. This should have no affect on # the output when all columns are selected, but fixed a bug when only an interval is #selected, and also the logic is clearer now. # #December 5, 2004 # use diagnostics # Generates the ILP to solve the problem of # setting the missing values in order to minimize the number of # resulting incompatible pairs of sites. # open IN, "$ARGV[0]"; open OUT, ">constraints"; open OUT1, ">objective"; open OUT2, ">trace"; # for debugging use $line1 = ; $line1 =~ /(\d+)/; $setnumber = $1; $line2 = ; chomp $line1; chomp $line2; @lines = ; $line = $lines[1]; #print "$line"; chomp $line; $line =~ tr/ //d; $stringlen = length ($line); print "$stringlen ***\n"; %count = (); $left = 0; $right = $stringlen-1; # right and left now define the range of the indices into the data. $li = 0; foreach $line (@lines) { $hli = 2*$li; $hlines[$hli] = $line; print "$hli, $hlines[$hli]\n"; $hli++; $hlines[$hli] = $line; print "$hli, $hlines[$hli]\n"; $li++; } #print "do the whole range or an interval - input d for whole range or two numbers for #an interval\n"; #$line = ; #if ($line =~ /d/) { #$left = 0; #$right = $stringlen-1; # right and left now define the range of the indices into the data. #} #elsif ($line =~ /(\d+) (\d+)/) { # $left = $1-1; # $right = $2-1; # print "got those numbers\n"; #} foreach $i ($left...$right -1) { # note that the column indices begin at 1. foreach $j ($i+1...$right) { $ii = $i+1; $jj = $j+1; $key = $ii . ',' . $jj; # print OUT "$key\n"; %{$hash{$key}} = (); $count{$key} = 0; } } print OUT1 "minimize \n"; # print OUTM "maximize \n"; $linecount = 1; foreach $line (@hlines) { $line =~ tr/ //d; chomp $line; @chars = split(//, $line); foreach $i ($left...$right -1) { foreach $j ($i+1...$right) { $ii = $i+1; $jj = $j+1; $key = $ii . ',' . $jj; $value = $chars[$i] . ',' . $chars[$j]; if (($chars[$i] eq '2') or ($chars[$j] eq '2')) { # print OUT "$i, $j \n"; $variable{$key} = 1; $vhash{$linecount}{$key} = $value; # print OUT "Another $key; $value, $count{$key}\n"; if ($chars[$i] ne '2') { # July 31, 2005 this is a kluge and a bass-ackwards way to do this, # but given the existing code, it is a quick hack and seems to # work - anyway, the speed of this code is not critical. Rather # it is the solution speed of the ILP produced that matters. $value0 = "$chars[$i],0"; $value1 = "$chars[$i],1"; if (! defined $hash{$key}{$value0}) { $hash{$key}{$value0} = 1; $count{$key}++; } if (! defined $hash{$key}{$value1}) { $hash{$key}{$value1} = 1; $count{$key}++; } } if ($chars[$j] ne '2') { $value0 = "0,$chars[$j]"; $value1 = "1,$chars[$j]"; if (! defined $hash{$key}{$value0}) { $hash{$key}{$value0} = 1; $count{$key}++; } if (! defined $hash{$key}{$value1}) { $hash{$key}{$value1} = 1; $count{$key}++; } } } elsif (! defined $hash{$key}{$value}) { # in this case, the tuple does not involve a 2, and # we want to record the binary type seen that col. pair $hash{$key}{$value} = 1; $count{$key}++; # this accumulates the number of binary types seen so far in that col. pair. # print OUT "Another $key; $value, $count{$key}\n"; } } } $linecount++; # this ends up overcounting the number of lines by one. } $count = 0; foreach $ekey (sort keys %count) { print OUT2 "XX $ekey $count{$ekey} \n"; if ($count{$ekey} == 4) { print OUT "LB$ekey = 1\n"; # this puts in an LB variable set to 1 for # every pair that is already in conflict. Remove this to # just compute the extra conflicts rather than the total conflicts. $count++; # this counts the number of pairs of cols. that are in conflict even ignoring the 2's ($ii,$jj) = split(/,/, $ekey); foreach $kk ($ii..$jj-1) { print OUT "+ R$kk "; $bin{"R$kk"} = 1; $Rvar{"R$kk"} = 1; if (0 == (($kk - $ii) % 8)) { print OUT "\n"; } } print OUT "-LB$ekey => 0\n"; } } $linecount = 0; foreach $line (@hlines) { $linecount++; $line =~ tr/ //d; chomp $line; @chars = split(//, $line); foreach $i ($left...$right) { $ii = $i+1; if ($chars[$i] eq '2') { print OUT "Y$linecount,$ii <= 1\n"; $bin{"Y$linecount,$ii"} = 1; if (0 == $linecount %2 ) { # if this is the second line of a variable pair # then add in the constraint that exactly one # of the variables must be set to 1 and the other to 0. $lcm1 = $linecount - 1; print OUT "Y$linecount,$ii + Y$lcm1,$ii = 1\n"; } } elsif ($chars[$i] eq '1') { print OUT "Y$linecount,$ii = 1\n"; } else { print OUT "Y$linecount,$ii = 0\n"; } } } $k = 0; $useful = 0; $paircount = $subcount = $sub22count = 0; foreach $i ($left...$right -1) { foreach $j ($i+1...$right) { $ii = $i+1; $jj = $j+1; $key = $ii . ',' . $jj; if ((defined $variable{$key}) and ($count{$key} < 4)) { # we only further examine a pair of columns if they have some # variable element(s) are not already incompatible $string11 = $string10 = $string01 = $string00 = ""; $c11 = $c10 = $c01 = $c00 = 0; $I11 = $I10 = $I01 = $I00 = 0; $stringforpair = ""; foreach $lin (1..$linecount){ if (defined $vhash{$lin}{$key}) { $paircount++; # print "$paircount\n"; $value = $vhash{$lin}{$key}; if ($vhash{$lin}{$key} eq '2,2') { if (!defined $hash{$key}{'1,1'}) { $I11 = 1; $c11++; # this records that 1,1 is missing in the pair of columns but can be # created in this row. $stringforpair .= "D$key,1,1 - Y$lin,$ii - Y$lin,$jj => -1\n"; #if the Y variables are set to 1,1 then the D11 # variable must be set to 1 for that column pair. } if (!defined $hash{$key}{'0,0'}) { $I00 = 1; $c00++; # this records that 0,0 is missing in the pair of columns but can be # created in this row. $stringforpair .= "D$key,0,0 + Y$lin,$ii + Y$lin,$jj => 1\n"; #if the Y variables are set to 0,0 then the D00 # variable must be set to 1 for that column pair. } if (!defined $hash{$key}{'1,0'}) { $I10 = 1; $c10++; # this records that 1,0 is missing in the pair of columns but can be # created in this row. $stringforpair .= "D$key,1,0 - Y$lin,$ii + Y$lin,$jj => 0\n"; #if the Y variables are set to 1,0 then the D10 # variable must be set to 1 for that column pair. } if (!defined $hash{$key}{'0,1'}) { $I01 = 1; $c01++; # this records that 0,1 is missing in the pair of columns but can be # created in this row. $stringforpair .= "D$key,0,1 + Y$lin,$ii - Y$lin,$jj => 0\n"; #if the Y variables are set to 0,1 then the D01 # variable must be set to 1 for that column pair. } # print "2,2\n"; } # end of the 2,2 case elsif ($vhash{$lin}{$key} eq '2,1') { # print "2,1\n"; if ((defined $hash{$key}{'1,1'}) and (defined $hash{$key}{'0,1'})) { # print "No help\n"; } else { if (!defined $hash{$key}{'1,1'}) { $I11 = 1; $c11++; # this records that 1,1 is missing in the pair of columns but can be # created in this row. $stringforpair .= "D$key,1,1 = 1 \n"; # Definetely the pair 1,1 will be created in this pair of cols. # $bin{"D$key,1,1"} = 1; } if (!defined $hash{$key}{'0,1'}) { $I01 = 1; $c01++; # this records that 0,1 is missing in the pair of columns but can be # created in this row. $stringforpair .= "D$key,0,1 = 1\n"; # $bin{"D$key,0,1"} = 1; } } # end of else } # end of 2,1 case elsif ($vhash{$lin}{$key} eq '2,0') { # print "2,0\n"; if ((defined $hash{$key}{'1,0'}) and (defined $hash{$key}{'0,0'})) { # print "No help\n"; } else { if (!defined $hash{$key}{'0,0'}) { $I00 = 1; $c00++; # this records that 0,0 is missing in the pair of columns but can be # created in this row. $stringforpair .= "D$key,0,0 = 1\n"; # $bin{"D$key,0,0"} = 1; } if (!defined $hash{$key}{'1,0'}) { $I10 = 1; $c10++; # this records that 1,0 is missing in the pair of columns but can be # created in this row. $stringforpair .= "D$key,1,0 = 1\n"; # $bin{"D$key,1,0"} = 1; } } # end of else } # end of 2,0 case elsif ($vhash{$lin}{$key} eq '1,2') { # print "1,2\n"; if ((defined $hash{$key}{'1,1'}) and (defined $hash{$key}{'1,0'})) { # print "No help\n"; } else { if (!defined $hash{$key}{'1,1'}) { $I11 = 1; $c11++; # this records that 1,1 is missing in the pair of columns but can be # created in this row. $stringforpair .= "D$key,1,1 = 1 \n"; #if the Y variables are set to 1,1 then the D11 # variable must be set to 1 for that column pair. # $bin{"D$key,1,1"} = 1; } if (!defined $hash{$key}{'1,0'}) { $I10 = 1; $c10++; # this records that 1,0 is missing in the pair of columns but can be # created in this row. $stringforpair .= "D$key,1,0 = 1\n"; #if the Y variables are set to 1,0 then the D10 # variable must be set to 1 for that column pair. # $bin{"D$key,1,0"} = 1; } } #end of else } #end of 1,2 case elsif ($vhash{$lin}{$key} eq '0,2') { # print "0,2\n"; if ((defined $hash{$key}{'0,0'}) and (defined $hash{$key}{'0,1'})) { # print "No help\n"; } else { if (!defined $hash{$key}{'0,0'}) { $I00 = 1; $c00++; # this records that 0,0 is missing in the pair of columns but can be # created in this row. $stringforpair .= "D$key,0,0 = 1\n"; # $bin{"D$key,0,0"} = 1; } if (!defined $hash{$key}{'0,1'}) { $I01 = 1; $c01++; # this records that 0,1 is missing in the pair of columns but can be # created in this row. $stringforpair .= "D$key,0,1 = 1\n"; # $bin{"D$key,0,1"} = 1; } } #end of else } # end of 0,2 case # print OUT "$lin, $key $vhash{$lin}{$key}\n"; # print OUT "Another $key; $value, $count{$key}\n"; } } # end of next line case if ($I11 + $I10 + $I01 + $I00 + $count{$key} == 4) { # here we check whether an incompatibility is possible for that column pair. $useful++; # print OUT "Column pair $ii, $jj is useful\n"; print OUT "$stringforpair\n"; $rstring = ""; $rstring = ""; if ($c11 > 0) { # need to check whether there was a way to make D11 = 1 # That is, if there is a 11 deficiency, and if it can be rectified. $rstring .= "+ D$key,1,1 "; $bin{"D$key,1,1"} = 1; } if ($c10 > 0) { $rstring .= "+ D$key,1,0 "; $bin{"D$key,1,0"} = 1; } if ($c01 > 0) { $rstring .= "+ D$key,0,1 "; $bin{"D$key,0,1"} = 1; } if ($c00 > 0) { $rstring .= "+ D$key,0,0 "; $bin{"D$key,0,0"} = 1; } $rhs = 3 - $count{$key}; # (May 9, 2006) print OUT "$rstring - 4 LB$key => -$count{$key}\n"; # these inequalities set LB$key to 1 if and only if print OUT "$rstring - LB$key <= $rhs\n"; # the Y values are set so that the column pair is # incompatible. The first line is the only if part. # and the second line is the if part. $bin{"LB$key"} = 1; foreach $kk ($ii..$jj-1) { print OUT "+ R$kk "; $bin{"R$kk"} = 1; $Rvar{"R$kk"} = 1; if (0 == (($kk - $ii) % 8)) { print OUT "\n"; } } print OUT "-LB$key => 0\n"; } # else { # print OUT "Column pair $ii, $jj is not useful\n"; # } } } } $kk = 0; foreach $rvar (sort keys %Rvar) { print OUT1 "+ $rvar"; $kk++; if (0 == (($kk - $ii) % 8)) { print OUT1 "\n"; } } print OUT1 "\n st \n"; # print OUTM "\n st \n"; print OUT "binaries\n"; foreach $index (sort keys %bin) { print OUT "$index\n"; } print OUT "end\n"; print "The number of non-perfect pairs is $count\n"; print "The potential number of subpairs is $paircount\n"; print "The subpotential number of subpairs is $subcount\n"; print "The number of useful pairs is $useful\n"; print "The specific subcounts are $sub22count, $sub21count, $sub12count, $sub20count, $sub02count\n"; close (IN); close (OUT); close (OUT1); $lpoutfile = "hhklp$setnumber"; system ("cat objective constraints> $lpoutfile"); print "The output file is $lpoutfile \n";