#!/usr/bin/env perl
use File::Basename;

# root of the LAGAN installation, taken from the environment
$lagandir = $ENV{LAGAN_DIR};

# arguments given to the cstat utility when overhang alignments are scored:
# two cutoffs and the window size
($reducedcutoff, $overlapcutoff, $overlapwindow) = (60, 90, 50);

# values for the "-gap" option of lagan.pl, added only when $usebounds is set
($anchgapstart, $anchgapcont) = (-10, -0.04);
$usebounds = 1;

# how many times the overhang score must exceed the contig score before a
# contig is dropped by the false-hit filter
$contigRetain = 2;

# name of the working subdirectory; the process id unless -skipfr replaces it
$pid = $$;

# utility subroutines
sub max {
    # numeric maximum of two values; when they compare equal the second
    # argument is the one returned
    my ($first, $second) = @_;
    return ($first > $second) ? $first : $second;
}

sub min {
    # numeric minimum of two values; when they compare equal the second
    # argument is the one returned
    my ($first, $second) = @_;
    return ($first < $second) ? $first : $second;
}

# process arguments
#
# Command line: cmerge seqfile mfafile draftfile outfile [-nocrop] [-skipfr pid]
#   -nocrop       sets $nocrop
#   -skipfr pid   sets $skipfr and replaces $pid with the given value
# Every other trailing argument is appended to $arglist.
my $usage = "usage:\n cmerge seqfile mfafile draftfile outfile [-nocrop] [-skipfr pid]\n";

# Only the four mandatory positional arguments are enforced.  An upper limit
# would reject the extra options that the loop below collects in $arglist.
if (@ARGV < 4) {
    print STDERR ($usage);
    exit(1);
}
$arglist = "";
$nocrop = 0;
for ($i = 4; $i < @ARGV; $i++) {
    if ($ARGV[$i] =~ /-nocrop/){
        $nocrop = 1;
    }
    elsif ($ARGV[$i] =~ /-skipfr/){
        # -skipfr needs a value; without one $pid would become undef and
        # every path built from it would be wrong
        if ($i + 1 >= @ARGV) {
            print STDERR ($usage);
            exit(1);
        }
        $skipfr = 1;
        $pid = $ARGV[++$i];
        chomp $pid;
    }
    else {
        $arglist = $arglist." ".$ARGV[$i];
    }
}
# $recurfl is not assigned in the code visible here, so this normally just
# appends a blank; kept so that $arglist has the same value as before
$arglist = "$arglist $recurfl";

# scratch files and logs live in <current directory>/<pid>
chomp ($newdir = `pwd`);
$newdir .= "/$pid";

# both files are opened in append mode
open (LOGFILE, ">>$newdir/log");
open (INFOFILE, ">>$newdir/minfo");

# announce this stage on the terminal and in the log
my $banner = "\n"
           . "Computing Contig Overlaps\n"
           . "-------------------------\n";
print STDERR $banner;
print LOGFILE $banner;

# the merged output file starts out holding only its header line
open (OFILE, ">$ARGV[3]");
print OFILE ">merged\n";
close (OFILE);

# the padding record that seqmerge is later asked to append between contigs
open (OFILE, ">$newdir/padding");
print OFILE ">padding\n", "NNNNNNNNNNNNNNNNNNNN.NNNNNNNNNNNNNNNNNNNN\n";
close (OFILE);

# output of the getlength utility for the first input file
chomp ($totlength = `$lagandir/utils/getlength $ARGV[0]`);
$mergedEnd = 0;

# read contig list
#
# The first three lines of the draft file are skipped.  Every later line is
# expected to look like
#     NAME.mfa --> (BEGIN END) score=S, offset=O, index=N
# and fills one slot of the parallel arrays @filenames, @matchBegin,
# @matchEnd, @score, @offset and @num.  @alignBegin and @alignEnd hold the
# two numbers printed by "getranges NAME.mfa 1 0", shifted by the offset;
# @alignLength is the size of that range.  $numContigs counts the slots.
$numContigs = 0;
@list = `cat $ARGV[2]`;
for ($i = 3; $i < @list; $i++){

    # blank lines describe no contig
    next if ($list[$i] =~ /^\s*$/);

    my $line = $list[$i];
    chomp $line;

    # A failed match leaves $1..$6 holding the captures of the previous
    # successful match, so an unparsable line has to be skipped; otherwise
    # it would be recorded as a copy of the preceding contig.
    unless ($list[$i] =~ /(.*)\.mfa --\> \((\d+) (\d+)\) score=(\d+), offset=(\d+), index=(\d+)/){
        print STDERR "Warning: skipping unrecognized draft line: $line\n";
        print LOGFILE "Warning: skipping unrecognized draft line: $line\n";
        next;
    }
    my @fields = ($1, $2, $3, $4, $5, $6);

    # same reasoning for the getranges output: without a successful match
    # the captures above would be reused as alignment bounds
    unless (`$lagandir/utils/getranges $fields[0].mfa 1 0` =~ /(\d+) (\d+)/){
        print STDERR "Warning: no alignment range for $fields[0].mfa, contig skipped\n";
        print LOGFILE "Warning: no alignment range for $fields[0].mfa, contig skipped\n";
        next;
    }
    my ($rangeBegin, $rangeEnd) = ($1, $2);

    # store at the next free slot so that skipped lines leave no holes
    my $slot = $numContigs;
    $filenames[$slot] = $fields[0];
    $matchBegin[$slot] = $fields[1];
    $matchEnd[$slot] = $fields[2];
    $score[$slot] = $fields[3];
    $offset[$slot] = $fields[4];
    $num[$slot] = $fields[5];

    $alignBegin[$slot] = $rangeBegin + $offset[$slot];
    $alignEnd[$slot] = $rangeEnd + $offset[$slot];
    $alignLength[$slot] = $alignEnd[$slot] - $alignBegin[$slot] + 1;

    $numContigs++;
}

# no cropping method: append the contig files in list order, with the padding
# record between neighbours, then leave the script
if ($nocrop){
    my $merged = $ARGV[3];
    my $scratch = "$ARGV[3].new";
    my $seqmerge = "$lagandir/utils/seqmerge";

    @contigs = `perl $lagandir/mextract.pl $ARGV[1]`;

    $i = 0;
    while ($i < $numContigs){
        `mv $merged $scratch`;

        # every contig except the first is preceded by the padding record
        unless ($i == 0){
            `$seqmerge $scratch $newdir/padding > $merged`;
            `mv $merged $scratch`;
        }
        `$seqmerge $scratch $filenames[$i] > $merged`;

        # getlength output is printed as is
        my $size = `$lagandir/utils/getlength $merged`;
        print STDERR "After $filenames[$i], size = ".$size;

        $i++;
    }

    my $done = "Merging complete!\n\n";
    print STDERR $done;
    print LOGFILE $done;

    close (LOGFILE);
    exit (0);
}


# redo contig alignments to reduce end expansion
#
# As the code stands, the only statement executed for each contig is the copy
# of "<name>.mfa" to "<name>.mfa.new".  Everything else in the loop body sits
# inside "if (0)" and never runs.
for ($i = 0; $i < $numContigs; $i++){

    # the merging code further down reads "<name>.mfa.new", so the file has to
    # exist even though no re-alignment takes place
    `cp $filenames[$i].mfa $filenames[$i].mfa.new`;

# disabled: shrink the alignment range of the contig and re-run lagan.pl on it
if (0){

    # compute expansion factor based on significantly conserved region
    # (boundaries are converted to alignment-file coordinates by subtracting
    # the contig's offset)
    $leftBoundary = $matchBegin[$i] - $offset[$i];
    $rightBoundary = $matchEnd[$i] - $offset[$i];
    `$lagandir/utils/cextract $filenames[$i].mfa $leftBoundary $rightBoundary 0 1 > $newdir/temp.fa`;
    $matchLength[$i] = `$lagandir/utils/getlength $newdir/temp.fa`;
    $expanFactor[$i] = ($matchEnd[$i] - $matchBegin[$i]) / $matchLength[$i];

    # compute left end size
    $leftBoundary = 1 - $offset[$i];
    $rightBoundary = $matchBegin[$i] - 1 - $offset[$i];
    `$lagandir/utils/cextract $filenames[$i].mfa $leftBoundary $rightBoundary 0 1 > $newdir/temp.fa`;
    $endLeftSize[$i] = `$lagandir/utils/getlength $newdir/temp.fa`;

    # compute right end size
    $leftBoundary = $matchEnd[$i] + 1 - $offset[$i];
    $rightBoundary = $totlength - $offset[$i];
    `$lagandir/utils/cextract $filenames[$i].mfa $leftBoundary $rightBoundary 0 1 > $newdir/temp.fa`;
    $endRightSize[$i] = `$lagandir/utils/getlength $newdir/temp.fa`;

    # compute new trimmed contig boundaries
    # (clamped to the range 1 .. $totlength)
    $newLeftBoundary[$i] = max (int $matchBegin[$i] - $endLeftSize[$i] * $expanFactor[$i], 1);
    $newRightBoundary[$i] = min (int $matchEnd[$i] + $endRightSize[$i] * $expanFactor[$i], $totlength);

    # NOTE(review): the getlength results above are not chomped; presumably
    # their trailing newline ends the report lines that have no "\n" of
    # their own -- confirm against the getlength output format
    print STDERR "\nFor contig $filenames[$i]:\n";
    print STDERR "   match length = $matchLength[$i]";
    print STDERR "   match range = ($matchBegin[$i] $matchEnd[$i])\n";
    print STDERR "   expansion factor = $expanFactor[$i]\n";
    print STDERR "   left end size = $endLeftSize[$i]";
    print STDERR "   right end size = $endRightSize[$i]";
    print STDERR "   new alignment range = ($newLeftBoundary[$i] $newRightBoundary[$i])\n";
    print STDERR "   old alignment range = ($alignBegin[$i] $alignEnd[$i])\n\n";

    # same report, written to the log file
    print LOGFILE "\nFor contig $filenames[$i]:\n";
    print LOGFILE "   match length = $matchLength[$i]";
    print LOGFILE "   match range = ($matchBegin[$i] $matchEnd[$i])\n";
    print LOGFILE "   expansion factor = $expanFactor[$i]\n";
    print LOGFILE "   left end size = $endLeftSize[$i]";
    print LOGFILE "   right end size = $endRightSize[$i]";
    print LOGFILE "   new alignment range = ($newLeftBoundary[$i] $newRightBoundary[$i])\n";
    print LOGFILE "   old alignment range = ($alignBegin[$i] $alignEnd[$i])\n\n";

    # create parameter string
    $contigLength[$i] = `$lagandir/utils/getlength $filenames[$i]`;
    chomp $contigLength[$i];
    $rangeParams = "-s1 $newLeftBoundary[$i] $newRightBoundary[$i] -s2 1 $contigLength[$i]";

    # perform new alignment
    # (the offset becomes the number of positions in front of the new left
    # boundary; the -gap option is appended after the stderr redirection)
    $offset[$i] = $newLeftBoundary[$i] - 1;
    $execute = "perl $lagandir/lagan.pl $ARGV[0] $filenames[$i] $rangeParams -bin $arglist -out $filenames[$i].bin 2> /dev/null";
    $execute = $execute." -gap $anchgapstart $anchgapcont" if ($usebounds);

    `$execute`;
    `$lagandir/utils/bin2mf $filenames[$i].bin > $filenames[$i].mfa.new`;
    `rm $filenames[$i].bin`;

    # read in new parameters for alignment
    `$lagandir/utils/getranges $filenames[$i].mfa.new 1 0` =~ /(\d+) (\d+)/;
    $alignBegin[$i] = $1 + $offset[$i];
    $alignEnd[$i] = $2 + $offset[$i];
    $alignLength[$i] = $alignEnd[$i] - $alignBegin[$i] + 1;
}

}

# extract contigs: run mextract.pl on the file with the same base name as the
# second argument inside the working directory, then feed every file name it
# prints through the rc utility (presumably reverse complement -- confirm),
# writing "<name>.rc"
$contigfile = basename ($ARGV[1]);
$contigdir = dirname ($ARGV[1]);

chomp ($newdir = `pwd`);
$newdir .= "/$pid";

@contigs = `perl $lagandir/mextract.pl $newdir/$contigfile`;
chomp @contigs;

$i = 0;
while ($i < @contigs){
    `$lagandir/utils/rc < $contigs[$i] > $contigs[$i].rc`;
    $i++;
}

# filter for false hit contigs
#
# @used records, per contig, whether it takes part in the merge below.  The
# first and last contig are always kept.  As the code stands every interior
# contig is kept as well: the actual test is inside "if (0)" and never runs,
# so $prevContig and $contigsLeft keep their initial values.
$prevContig = 0;
$used[0] = $used[$numContigs - 1] = 1;
$contigsLeft = $numContigs;

for ($i = 1; $i < $numContigs - 1; $i++){

    $used[$i] = 1;

    # disabled: drop a contig when the overhangs of its neighbours match the
    # finished sequence between them better than the contig itself does
    if (0){

    $nextContig = $i + 1;

    # a contig whose alignment range lies strictly between the match ranges of
    # its neighbours is kept without further testing
    if ($matchEnd[$prevContig] < $alignBegin[$i] && $alignEnd[$i] < $matchBegin[$nextContig]){
	$used[$i] = 1;
	next;
    }

    # extract concatenation of previous and next overhangs
    $leftBoundary = $matchEnd[$prevContig] + 1 - $offset[$prevContig];
    $rightBoundary = $alignEnd[$prevContig] - $offset[$prevContig];
    `$lagandir/utils/cextract $filenames[$prevContig].mfa.new $leftBoundary $rightBoundary 0 1 -subst $filenames[$prevContig] > $newdir/seq1`;

    $leftBoundary = $alignBegin[$nextContig] - $offset[$nextContig];
    $rightBoundary = $matchBegin[$nextContig] - 1 - $offset[$nextContig];
    `$lagandir/utils/cextract $filenames[$nextContig].mfa.new $leftBoundary $rightBoundary 0 1 -subst $filenames[$nextContig] > $newdir/seq2`;

    `$lagandir/utils/seqmerge $newdir/seq1 $newdir/seq2 > $newdir/seq3`;

    # extract interval from finished sequence
    # (this overwrites seq1, which has already been merged into seq3)
    $leftBoundary = $matchEnd[$prevContig] + 1;
    $rightBoundary = $matchBegin[$nextContig] - 1;
    `$lagandir/utils/cextract $ARGV[0] $leftBoundary $rightBoundary 0 0 > $newdir/seq1`;

    # align overhangs with finished sequence interval
    # score = (first number printed by countmatch)^2 / (product of both lengths)
    `$lagandir/lagan.pl $newdir/seq1 $newdir/seq3 -bin -out $newdir/aligned.bin 2> /dev/null`;
    `$lagandir/utils/bin2mf $newdir/aligned.bin > $newdir/aligned.mfa`;
    `rm $newdir/aligned.bin`;
    @aligned = `$lagandir/utils/countmatch $newdir/aligned.mfa`;
    $aligned[0] =~ /(\d+)/;
    $overhangScore = $1 * $1 / (`$lagandir/utils/getlength $newdir/seq1` * `$lagandir/utils/getlength $newdir/seq3`);
    
    # align contig with finished sequence interval
    # (same scoring formula, with the contig in place of the overhangs)
    `$lagandir/lagan.pl $newdir/seq1 $filenames[$i] -bin -out $newdir/aligned.bin 2> /dev/null`;
    `$lagandir/utils/bin2mf $newdir/aligned.bin > $newdir/aligned.mfa`;
    `rm $newdir/aligned.bin`;
    @aligned = `$lagandir/utils/countmatch $newdir/aligned.mfa`;
    $aligned[0] =~ /(\d+)/;
    $contigScore = $1 * $1 / (`$lagandir/utils/getlength $newdir/seq1` * `$lagandir/utils/getlength $filenames[$i]`);
    
    print STDERR "\nFor contig $filenames[$i]:\n";
    print STDERR "   surrounding overhang match percentage = $overhangScore\n";
    print STDERR "   contig match percentage = $contigScore\n\n";

    print LOGFILE "\nFor contig $filenames[$i]:\n";
    print LOGFILE "   surrounding overhang match percentage = $overhangScore\n";
    print LOGFILE "   contig match percentage = $contigScore\n\n";

    # determine if contig is kept or not -- contigRetain is a measure of how much the overhang score must
    # exceed the contig score before the contig is cropped

    if ($overhangScore > $contigRetain * $contigScore){

	print STDERR "   Contig was removed!\n";
	print LOGFILE "   Contig was removed!\n";
	
	$used[$i] = 0;
	$contigsLeft--;
    }
    else {
	# a kept contig becomes the left neighbour for the next test
	$used[$i] = 1;
	$prevContig = $i;
    }

}
}

# merge contigs

# put the padding record in front of everything else in the merged file
`mv $ARGV[3] $ARGV[3].new`;
`$lagandir/utils/seqmerge $ARGV[3].new $newdir/padding > $ARGV[3]`;

# bookkeeping for the first contig's minfo record: it begins right behind the
# padding and nothing is chopped from its front
@temp = `$lagandir/utils/getlength $ARGV[3]`;
chomp $temp[0];
$contigStart = 1 + $temp[0];
($startChop, $secFrom) = (0, 1);

# append the stretch of the first contig's alignment that precedes its match
# range (alignment-file coordinates, hence the offset subtraction)
$leftBoundary = $alignBegin[0] - $offset[0];
$rightBoundary = $matchBegin[0] - $offset[0] - 1;
`$lagandir/utils/cextract $filenames[0].mfa.new $leftBoundary $rightBoundary 0 1 -subst $filenames[0] > $newdir/seq1`;
`$lagandir/utils/seqmerge $ARGV[3] $newdir/seq1 > $ARGV[3].new`;
`mv $ARGV[3].new $ARGV[3]`;

# Walk over the used contigs pairwise: $j is the previously used contig (the
# one whose match range is written to the merged file in this iteration) and
# $i is the next used contig.  Each iteration also writes the two-line minfo
# record for contig $j.
#
# Scratch files used below:
#   seq1  part of contig $j's alignment after its match range (right overhang)
#   seq2  part of contig $i's alignment before its match range (left overhang)
#   seq3  interval of the finished sequence between the two match ranges
#   seq4  match range of contig $j
#
# $transition selects how the junction between $j and $i is written:
#   0  overhangs do not overlap or do not align: seq1, padding, seq2
#   1  keep contig $j's right overhang (seq1) and drop seq2
#   2  drop seq1 and keep contig $i's left overhang (seq2)
$j = 0;
for ($i = 1; $i < $numContigs; $i++){

    next if (!$used[$i]);

    print STDERR "\n***** Merging $filenames[$j] and $filenames[$i]...\n\n";
    print STDERR "File name:   $filenames[$j]\n";
    print STDERR "Match range: ($matchBegin[$j] $matchEnd[$j]) (given in finished sequence coordinates)\n";
    print STDERR "Align range: ($alignBegin[$j] $alignEnd[$j]) (given in finished sequence coordinates)\n";
    print STDERR "Match offset: $offset[$j] (number of chars omitted from finished sequence at beginning of alignment)\n\n";
    print STDERR "File name:   $filenames[$i]\n";
    print STDERR "Match range: ($matchBegin[$i] $matchEnd[$i]) (given in finished sequence coordinates)\n";
    print STDERR "Align range: ($alignBegin[$i] $alignEnd[$i]) (given in finished sequence coordinates)\n";
    print STDERR "Match offset: $offset[$i] (number of chars omitted from finished sequence at beginning of alignment)\n\n";

    # same report, written to the log file
    print LOGFILE "\n***** Merging $filenames[$j] and $filenames[$i]...\n\n";
    print LOGFILE "File name:   $filenames[$j]\n";
    print LOGFILE "Match range: ($matchBegin[$j] $matchEnd[$j]) (given in finished sequence coordinates)\n";
    print LOGFILE "Align range: ($alignBegin[$j] $alignEnd[$j]) (given in finished sequence coordinates)\n";
    print LOGFILE "Match offset: $offset[$j] (number of chars omitted from finished sequence at beginning of alignment)\n\n";
    print LOGFILE "File name:   $filenames[$i]\n";
    print LOGFILE "Match range: ($matchBegin[$i] $matchEnd[$i]) (given in finished sequence coordinates)\n";
    print LOGFILE "Align range: ($alignBegin[$i] $alignEnd[$i]) (given in finished sequence coordinates)\n";
    print LOGFILE "Match offset: $offset[$i] (number of chars omitted from finished sequence at beginning of alignment)\n\n";

    # extract overhangs
    # seq1: contig $j from just after its match range to the end of its
    # alignment; $leftBoundaryLength is the length of that piece
    $leftBoundary = $matchEnd[$j] + 1 - $offset[$j];
    $rightBoundary = $alignEnd[$j] - $offset[$j];
    `$lagandir/utils/cextract $filenames[$j].mfa.new $leftBoundary $rightBoundary 0 1 -subst $filenames[$j] > $newdir/seq1`;
    @temp = `$lagandir/utils/getlength $newdir/seq1`; chomp $temp[0];
    $leftBoundaryLength = $temp[0];
    
    # seq2: contig $i from the start of its alignment to just before its
    # match range; $rightBoundaryLength is the length of that piece
    $leftBoundary = $alignBegin[$i] - $offset[$i];
    $rightBoundary = $matchBegin[$i] - 1 - $offset[$i];
    `$lagandir/utils/cextract $filenames[$i].mfa.new $leftBoundary $rightBoundary 0 1 -subst $filenames[$i] > $newdir/seq2`;
    @temp = `$lagandir/utils/getlength $newdir/seq2`; chomp $temp[0];
    $rightBoundaryLength = $temp[0];
    
    # check for non-overlapping ranges
    if ($alignEnd[$j] < $alignBegin[$i]){
	$transition = 0;
    }
    else {

	# compute overhang alignment score
	# (the two overhangs are aligned to each other; the score is the first
	# number in the cstat output)
	`$lagandir/lagan.pl $newdir/seq1 $newdir/seq2 -bin -out $newdir/aligned.bin 2> /dev/null`;
	`$lagandir/utils/bin2mf $newdir/aligned.bin > $newdir/aligned.mfa`;
	`rm $newdir/aligned.bin`;
	@aligned = `$lagandir/utils/cstat $newdir/aligned.mfa $overlapcutoff $overlapwindow`;
	$aligned[0] =~ /(\d+)/;
	$overhangScore = $1;

	print STDERR "Overlap score: $overhangScore ($overlapcutoff percent conservation over $overlapwindow bases)\n";
	print LOGFILE "Overlap score: $overhangScore ($overlapcutoff percent conservation over $overlapwindow bases)\n";

	# check for overhang alignment
	if ($overhangScore > 0){

	    # extract interval from finished sequence
	    $leftBoundary = $matchEnd[$j] + 1;
	    $rightBoundary = $matchBegin[$i] - 1;
	    `$lagandir/utils/cextract $ARGV[0] $leftBoundary $rightBoundary 0 0 > $newdir/seq3`;

	    # compute overhang alignment scores
	    # (each overhang is aligned to the finished-sequence interval and
	    # scored with the reduced cutoff)
	    `$lagandir/lagan.pl $newdir/seq3 $newdir/seq1 -bin -out $newdir/aligned.bin 2> /dev/null`;
	    `$lagandir/utils/bin2mf $newdir/aligned.bin > $newdir/aligned.mfa`;
	    `rm $newdir/aligned.bin`;
	    @aligned = `$lagandir/utils/cstat $newdir/aligned.mfa $reducedcutoff $overlapwindow`;
	    $aligned[0] =~ /(\d+)/;
	    $overhangLeftScore = $1;

	    print STDERR "Left overhang score: $overhangLeftScore ($reducedcutoff percent conservation over $overlapwindow bases)\n";
	    print LOGFILE "Left overhang score: $overhangLeftScore ($reducedcutoff percent conservation over $overlapwindow bases)\n";
	    
	    `$lagandir/lagan.pl $newdir/seq3 $newdir/seq2 -bin -out $newdir/aligned.bin 2> /dev/null`;
	    `$lagandir/utils/bin2mf $newdir/aligned.bin > $newdir/aligned.mfa`;
	    `rm $newdir/aligned.bin`;
	    @aligned = `$lagandir/utils/cstat $newdir/aligned.mfa $reducedcutoff $overlapwindow`;
	    $aligned[0] =~ /(\d+)/;
	    $overhangRightScore = $1;

	    print STDERR "Right overhang score: $overhangRightScore ($reducedcutoff percent conservation over $overlapwindow bases)\n";
	    print LOGFILE "Right overhang score: $overhangRightScore ($reducedcutoff percent conservation over $overlapwindow bases)\n";

	    # the overhang with the higher score is kept; on a tie the next
	    # contig's overhang (seq2) wins
	    if ($overhangLeftScore > $overhangRightScore){
		$transition = 1;
	    }
	    else {
		$transition = 2;
	    }	    
	}
	else {
	    $transition = 0;
	}
    }

    # do the actual merging
    # seq4: the match range of contig $j
    $leftBoundary = $matchBegin[$j] - $offset[$j];
    $rightBoundary = $matchEnd[$j] - $offset[$j];
    `$lagandir/utils/cextract $filenames[$j].mfa.new $leftBoundary $rightBoundary 0 1 -subst $filenames[$j] > $newdir/seq4`;

    # first half: finish contig $j.  $contigStop is the length of the merged
    # sequence once contig $j has been written; $endChop is the amount of
    # contig $j's right overhang that was left out.
    if ($transition == 0){
	`$lagandir/utils/seqmerge $ARGV[3] $newdir/seq4 $newdir/seq1 > $ARGV[3].new`;
	@temp = `$lagandir/utils/getlength $ARGV[3].new`; chomp $temp[0];
	$contigStop = $temp[0];
	$endChop = 0;
	# padding separates the two contigs; the result is back in the output file
	`$lagandir/utils/seqmerge $ARGV[3].new $newdir/padding > $ARGV[3]`;
    }
    elsif ($transition == 1){
	# the current merged sequence stays in "<outfile>.new" for the second half
	`$lagandir/utils/seqmerge $ARGV[3] $newdir/seq4 $newdir/seq1 > $ARGV[3].new`;
	@temp = `$lagandir/utils/getlength $ARGV[3].new`; chomp $temp[0];
	$contigStop = $temp[0];
	$endChop = 0;
    }
    elsif ($transition == 2){
	# seq1 is left out entirely, so its whole length counts as chopped
	`$lagandir/utils/seqmerge $ARGV[3] $newdir/seq4 > $ARGV[3].new`;
	@temp = `$lagandir/utils/getlength $ARGV[3].new`; chomp $temp[0];
	$contigStop = $temp[0];
	$endChop = $leftBoundaryLength;
    }
    $secTo = $alignLength[$j] - $endChop;

    # a contig file name containing ".rc" is reported with direction "-"
    if (index ($filenames[$j], ".rc") == -1) { $direction = "+"; } else { $direction = "-"; }
  
    # minfo record for contig $j: the first line of the contig file without
    # its first character, then one line of space-separated fields
    @temp = `head $filenames[$j]`;
    chomp $temp[0]; $temp[0] = substr $temp[0], 1;
    print INFOFILE "$temp[0]\n";
    print INFOFILE "$num[$j] $matchBegin[$j] $matchEnd[$j] $contigStart $contigStop $startChop $endChop $direction $score[$j] $secFrom $secTo\n";

    # second half: start contig $i.  $contigStart is the position right after
    # what has been written so far; $startChop is the amount of contig $i's
    # left overhang that is left out.
    if ($transition == 0){
	@temp = `$lagandir/utils/getlength $ARGV[3]`; chomp $temp[0];
	$contigStart = $temp[0] + 1;
	$startChop = 0;
	`$lagandir/utils/seqmerge $ARGV[3] $newdir/seq2 > $ARGV[3].new`;
	`mv $ARGV[3].new $ARGV[3]`;
    }
    elsif ($transition == 1){
	# seq2 is not appended, so its whole length counts as chopped
	@temp = `$lagandir/utils/getlength $ARGV[3].new`; chomp $temp[0];
	$contigStart = $temp[0] + 1;
	$startChop = $rightBoundaryLength;
	`mv $ARGV[3].new $ARGV[3]`;
    }
    elsif ($transition == 2){
	@temp = `$lagandir/utils/getlength $ARGV[3].new`; chomp $temp[0];
	$contigStart = $temp[0] + 1;
	$startChop = 0;
	`$lagandir/utils/seqmerge $ARGV[3].new $newdir/seq2 > $ARGV[3]`;
    }
    $secFrom = 1 + $startChop;

    # contig $i becomes the left-hand contig of the next pair
    $j = $i;
}

# finish with the last contig of the list: append its alignment from the start
# of the match range to the end of the alignment
my $last = $numContigs - 1;

$rightBoundary = $alignEnd[$last] - $offset[$last];
$leftBoundary = $matchBegin[$last] - $offset[$last];
`$lagandir/utils/cextract $filenames[$last].mfa.new $leftBoundary $rightBoundary 0 1 -subst $filenames[$last] > $newdir/seq1`;
`$lagandir/utils/seqmerge $ARGV[3] $newdir/seq1 > $ARGV[3].new`;
`mv $ARGV[3].new $ARGV[3]`;

# values for the last minfo record: nothing is chopped from the end
@temp = `$lagandir/utils/getlength $ARGV[3]`;
chomp $temp[0];
($contigStop, $endChop) = ($temp[0], 0);
$secTo = $alignLength[$j] - $endChop;

# the merged sequence ends with a padding record
`mv $ARGV[3] $ARGV[3].new`;
`$lagandir/utils/seqmerge $ARGV[3].new $newdir/padding > $ARGV[3]`;

# a contig file name containing ".rc" is reported with direction "-"
$direction = (index ($filenames[$last], ".rc") == -1) ? "+" : "-";

# minfo record: first line of the contig file without its first character,
# then one line of space-separated fields
@temp = `head $filenames[$last]`;
chomp $temp[0];
$temp[0] = substr ($temp[0], 1);
print INFOFILE "$temp[0]\n";
print INFOFILE "$num[$last] $matchBegin[$last] $matchEnd[$last] $contigStart $contigStop $startChop $endChop $direction $score[$last] $secFrom $secTo\n";


# report completion on the terminal and in the log
my $done = "Merging complete!\n\n";
print STDERR $done;
print LOGFILE $done;

# remove the scratch files that exist; the padding file is removed
# unconditionally
foreach my $scratch ("seq1", "seq2", "seq3", "seq4", "aligned.mfa"){
    `rm $newdir/$scratch` if (-e "$newdir/$scratch");
}
`rm $newdir/padding`;

close (LOGFILE);
close (INFOFILE);
