# Copyright 2006 Chien-I Liao
# All Rights Reserved.
#
# Author: Chien-I Liao (cil217@nyu.edu)
# Version 1.0
#
# Goal: combine different gene trees obtained from reliable evidence into
# the Graph of Life (an extension of phylogenetic tree)
#
# Basic introduction:
# By examining corresponding gene variants (orthologs) in different species,
# we may infer the history of evolution. The most common way to demonstrate
# the result is to use the notation of the phylogenetic tree: a rooted,
# leaf-labeled tree. Each leaf has a label representing the name of current
# species. For example:
#
#                      (root)
#                       /\
#                     1/  \
#                     /\  sharks
#                   2/  \
#                   /\  frogs
#                 3/  \
#                 /\  birds
#               4/  \
#               /\  dogs
#              /  \
#         humans  monkeys
#
#   In phylogenetics, trees are usually represented by well parenthesized
# strings which is also known as the Newick tree format. Therefore the above
# tree will be recorded as:
#     (((((humans,monkeys),dogs),birds),frogs),sharks)
#   Generally, phylogenetic trees are unordered, meaning that the order
# between siblings is unimportant. So we can freely rewrite the above tree to:
#     (sharks,((birds,((monkeys,humans),dogs)),frogs))
#   All internal nodes (those numbered in above tree) are hypothetically
# extinct species, which is a common assumption in phylogenetics. For example,
# we assume we will not find the common ancestor of humans and monkeys
# (species 4) because we believe it is extinct.
#   Another interpretation is through the relationship between species. For
# the example above, we can say humans and monkeys have a closer relationship
# than humans and frogs. Similarly, sharks and dogs have a farther
# relationship than birds and dogs. In fact, gene analysis usually gives
# us only relationships between species and trees are reconstructed
# accordingly. There are many methods and software available for this purpose,
# but that is independent to our problem. Here we assume trees are already
# generated. The problem is that we might get different trees by analyzing
# different genes. Suppose we get the following two trees:
#     (((((humans,monkeys),dogs),birds),frogs),sharks)
#     ((((humans,(monkeys,dogs)),birds),frogs),sharks)
# There are two things we can do:
# 1. Throw one of them away and continue believing that the history of
#    evolution can be modeled as a tree.
# 2. Drop the assumption of "tree of life" and merge two trees into a graph.
#   We take the second strategy and the merged graph will be:
#
#                      (root)
#                       /\
#                     1/  \
#                     /\  sharks
#                   2/  \
#                   /\  frogs
#                 3/  \
#                 / \  birds
#               4/   \5
#               /\  / \
#              /  \/   \
#        humans monkeys dogs
#
#   This is not a new idea in biology. In this case monkey is said to be
# a hybrid species or a species from an ancient interbreeding event. Of
# course, these events should be rare compared to gene mutations, which cause
# branching in phylogenetic trees. So our goal is to combine all trees into
# a graph with the fewest interbreeding events.
#
# Input:
#
#   Each gene suggests one or more possible phylogenetic trees in the
# Newick tree format (alphabets stand for species names):
#
#         ((F (A B)) ((E C) D)) [gene_name; score]
#                                  < optional >
#
#   Gene names and scores (ranks) are optional in general, but they are
# required when multiple trees are suggested by the same gene (trees are
# grouped by gene name). Scores are indicators for tree selection. Lower
# scores mean stronger support of evidence and are preferred during
# selection. If your data has the opposite meaning, please set the flag
# -r/--reverse_score on the command line.
#
# Sample input:
# (((((((Scer Spar) Smik) Skud) Sbay) Scas) Sklu) Calb) [g1;17]
# (((((((Scer Spar) Smik) Skud) Sbay) Sklu) Scas) Calb) [g1;29]
# ((((((Scer Spar) (Skud Sbay)) Smik) Scas) Sklu) Calb) [g1;34]
# (((((Scer Spar) (Smik (Skud Sbay))) Scas) Sklu) Calb) [g2;62]
# (((((Scer Spar) (Skud Sbay)) Smik) (Scas Sklu)) Calb) [g2;34]
# (((((Scer Spar) Smik) Skud Sbay) (Scas Sklu)) Calb) [g2;42]
# (((((Scer Spar) (Skud Sbay)) Smik) Scas) Calb) [g3;134]
# ((((Scer Spar) (Smik Skud Sbay)) Scas) Calb) [g3;223]
# (((((Smik Skud) Scer Spar Sbay) Scas) Sklu) Calb) [g4;50]
#
# Output:
#   The output graph will be presented in the DOT language. To transform
# it into a visualized graph, on Unix please set -d 0 or --debug_level=0
# and save the output to filename.dot. A typical command:
#     python graph_of_life.py -d 0 {other flags} > filename.dot
# Then you can run either one of the following commands to output a .ps
# or a .jpg file:
#     dot -Tps filename.dot -o filename.ps
#     dot -Tjpg filename.dot -o filename.jpg
# For other operating systems, please visit http://www.graphviz.org/ for
# software downloads and documentation.
#
# TODO: Improve missing orthologs placement

import sys
import os
import copy
import getopt
import math
import random
import re
import string
import time

# Seed the random number generator. No explicit seed is given, so runs are
# not reproducible from one invocation to the next.
random.seed()

# Command Line Flags
ANNEALING_METHOD          = False # Default using consensus method
DEBUG_LEVEL               = 1     # Normal/Debug output. Code in this file
                                  # prints extra diagnostics when it is
                                  # greater than 2, and more when greater
                                  # than 3
DELIMITER                 = ' '   # Delimiter in the Newick tree format
MAXIMUM_SELECTING_TIME    = -1    # Maximum seconds allowed to select the
                                  # trees.  -1: no time limit
MAXIMUM_COMBINING_TIME    = -1    # Maximum seconds allowed to combine the
                                  # trees.  -1: no time limit
MAX_RESTART_ITERATION     = 30    # Iteration number of restarting annealing
                                  # or consensus method
MAX_HITTING_SET_ITERATION = 30    # Iteration number for hitting set solver
MAX_EVALUATING_ITERATION  = 30    # Iteration number for graph evaluation
                                  # (extra graph reconstructions tried by
                                  # Scenario.Compute_Evaluation_Score)
MAX_GRAPH_ITERATION       = 100   # Iteration number of building graph

# Consensus Method parameters
FREQUENCY_THRESHOLD       = 0

# Simulated Annealing parameters
INITIAL_TEMPERATURE       = 1.0
FINAL_TEMPERATURE         = 0.01
ALPHA                     = 0.8   # Cool down factor in annealing

# Flags for tree scores (used only in simulated annealing process)
# MULTIPLIER weights the interbreeding-event count in the evaluation score.
# It should be greater than the maximum score minus the minimum score of
# input trees times the number of genes
MULTIPLIER                = 10000.0
REVERSE_SCORE             = -1    # Higher score means higher confidence?
                                  # (default: NO)

# Flags to determine if tree selection is necessary
SINGLE_TREE_CASE          = True
MISSING_ORTHOLOG          = False # Set to True by Scenario.Fill_Orthologs
                                  # when a species lacks a gene

# Global variables

gene_evidence_list = [] # A list of gene evidences, filled according to input
species_name_list = []  # A list of species names, filled according to input
gene_name_list = []     # A list of gene families, filled according to input
miss_index = 0          # For indexing internal(extinct) species with name
                        # 'miss' + str(miss_index)

# Class definition

# For each gene, the corresponding Gene_Evidence is the collection of
# all possible phylogenetic trees supported by it.
# name: the name of the gene
# trees: a collection of Gene_Tree, in Newick tree format

class Gene_Evidence:
  """All candidate phylogenetic trees supported by a single gene.

  name:  the name of the gene
  trees: list of candidate trees; starts empty and is filled by the caller
  """
  def __init__(self, name):
    self.trees = []
    self.name = name
  def __repr__(self):
    # Header line with the gene name, followed by the repr of the tree list
    header = "Gene %s : \n" % (self.name,)
    return header + "%r" % (self.trees,)

# The tree structure supported by some gene evidence
# tree: the Newick tree format of the tree
# score(rank): indicator for tree selection. Trees with low scores are
# preferred
# Example:
# (((((((Scer Spar) Smik) Skud) Sbay) Scas) Sklu) Calb) [g1;17]
# [                    tree                           ] [name;score]

class Gene_Tree:
  """One tree suggested by a gene, together with its selection score.

  name:  the name of the gene this tree belongs to
  tree:  the tree in Newick format
  score: indicator used for tree selection
  """
  def __init__(self, name, tree, score):
    self.name, self.tree, self.score = name, tree, score
  def __repr__(self):
    # Score is rendered as a float, then the Newick string
    score_text = "%f" % self.score
    return " %s %s" % (score_text, self.tree)
  def Copy(self):
    """Return a new Gene_Tree with the same name, a sliced copy of the
    tree and the same score."""
    duplicate = Gene_Tree(self.name, self.tree[:], self.score)
    return duplicate

# A scenario: pick one tree for each gene and evaluate it by
# the total score and the conflicts within these trees
#
# selected_trees: a list of selected trees, exactly one from each gene
# species_list: store all species with their orthologs
# conflicts: the minimal hybrid species set of this scenario.
# score: the evaluation of this scenario
#
# Fill_Orthologs(): transform trees to subscript representation of orthologs
#   Requirement: (1) Must obtain the whole species list before executing
#                (2) string.printable was used to encode the subscript,
#                    which has only 100 choices. Must be careful not to
#                    exceed this limit
#   Example: gx: ((S1 S2) S3 (S4 (S5 S6))) -->
#            S1: gx_11; S2: gx_12; S3: gx_2; S4: gx_31; S5: gx_321; S6: gx_322
#   Sometimes this gene would be missing in some species, if we have two trees
#            gx: ((S1 S2) S3 (S4 (S5 S6)))
#            gy: (((S1 S7) S3) (S4 S5) (S6 S2)))
#   Then we know S7 does not have gene gx. My algorithm, however, requires
#   full orthologs in every species. Therefore, in above example we have to
#   create a hypothetical gx ortholog for S7. In this program, we insert S7
#   to the gx tree before a randomly chosen right parenthesis. So S7 might
#   contain gx_13, gx_323, gx_33 or gx_4
#   TODO: This is definitely not good, should get some smart way to insert
#         missing orthologs
#
# Compute_Evaluation_Score(): compute the score of this scenario
#   Requirement: Fill_Orthologs() must be called first
#
# Evaluation(): return score
#
# Mutation(): return a NEW Scenario object. (Does not change the original)
#             With probability 1-(1/n) the selected_trees[i] would be the
#             same with original scenario. Otherwise randomly select
#             a tree in that gene family and insert it to the new scenario.
#             Here n equals to the number of trees so the expected number of
#             different trees in two scenarios is 1.

class Scenario:
  """One candidate selection of gene trees: one tree picked per gene.

  selected_trees: list of the selected tree objects (each has name, tree
                  and score attributes)
  species_list:   Species objects carrying their orthologs; filled by
                  Fill_Orthologs()
  conflicts:      the minimal hybrid species set of this scenario
  score:          the evaluation of this scenario; set by
                  Compute_Evaluation_Score()
  """

  # Raised whenever a subscript would need a character beyond the end of
  # string.printable.
  _SUBSCRIPT_OVERFLOW = ("No more characters could be used "
                         "to represent the subscript...")

  def __init__(self, selected_trees):
    self.selected_trees = selected_trees[:]
    self.species_list = []
    self.conflicts = []
    self.score = 0

  def __repr__(self):
    # One "name: tree" line per selected tree, then the score
    lines = ["%s: %s\n" % (t.name, t.tree) for t in self.selected_trees]
    lines.append(str(self.score))
    return "".join(lines)

  def Copy(self):
    """Return a new Scenario sharing the selected tree objects but with
    copied Species objects, a copied conflict list and the same score."""
    new_scene = Scenario(self.selected_trees)
    new_scene.species_list = [s.Copy() for s in self.species_list]
    new_scene.conflicts = self.conflicts[:]
    new_scene.score = self.score
    return new_scene

  def _Encode_Subscript(self, count, depth):
    """Encode count[0..depth-1] as a string, one printable char per level.

    Raises ValueError if any counter has no corresponding character in
    string.printable.
    """
    pieces = []
    for j in range(depth):
      if count[j] >= len(string.printable):
        raise ValueError(self._SUBSCRIPT_OVERFLOW)
      pieces.append(string.printable[count[j]])
    return "".join(pieces)

  def Fill_Orthologs(self):
    """Translate every selected tree into per-species ortholog subscripts.

    Appends one Species per entry of the global species_name_list to
    self.species_list, then walks each selected tree character by
    character and gives every species named in it an Ortholog whose
    subscript encodes its position in the tree. Species that do not occur
    in a tree receive a hypothetical ortholog (exists=False) at a randomly
    chosen insertion point, and the global MISSING_ORTHOLOG is set to True.

    Raises Exception if species_name_list is empty, and ValueError if a
    subscript would need more characters than string.printable offers or
    (via Find_Name_Or_Die) if a tree names an unknown species.
    """
    global MISSING_ORTHOLOG
    if species_name_list == []:
      raise Exception("Global variable species_name_list not initialized")
    for name in species_name_list:
      self.species_list.append(Species(name))
    for t in self.selected_trees:
      gene_name = t.name
      depth = 0
      max_depth = 0
      # Counter for the subscripts, count[i] = j means this species
      # was under the jth subtree of depth i ancestor
      count = []
      filled = []      # Used to check missing orthologs in species
      tmp_species_name = ""
      # Store places where species could be inserted into the tree
      insert_point = []
      for i in t.tree:
        if i == "(":
          depth += 1
          if depth > max_depth:
            max_depth = depth
            count.append(0)
        elif i == ")" or i in DELIMITER:
          if tmp_species_name != "": # Ignore empty species name
            tmp_subscript = self._Encode_Subscript(count, depth)
            species_found = Find_Name_Or_Die(self.species_list,
                                             tmp_species_name)
            species_found.orthologs.append(
                Ortholog(gene_name, tmp_subscript, True))
            filled.append(tmp_species_name)
            tmp_species_name = ""
          if i in DELIMITER:
            count[depth-1] += 1
          if i == ")":
            # Record the slot right after the last child of the subtree
            # being closed as a place where a species could be inserted
            tmp_subscript = self._Encode_Subscript(count, depth-1)
            if count[depth-1] + 1 >= len(string.printable):
              raise ValueError(self._SUBSCRIPT_OVERFLOW)
            tmp_subscript += string.printable[count[depth-1] + 1]
            insert_point.append(tmp_subscript)
            depth -= 1
            count[depth] = 0   # Reset the counter of the closed level
        else:
          tmp_species_name = "%s%s" % (tmp_species_name, i)
      # Fill in missing orthologs for all species
      for s in self.species_list:
        if s.name not in filled:  # This species has missing ortholog
          MISSING_ORTHOLOG = True
          random_index = random.choice(range(len(insert_point)))
          random_subscript = insert_point[random_index]
          s.orthologs.append(Ortholog(gene_name, random_subscript, False))
          if random_subscript[-1] == string.printable[-1]:
            raise ValueError(self._SUBSCRIPT_OVERFLOW)
          # Advance this insertion point to the next free sibling slot
          next_char = string.printable[
              string.printable.index(random_subscript[-1]) + 1]
          insert_point[random_index] = random_subscript[:-1] + next_char
    if DEBUG_LEVEL > 3:
      for s in self.species_list:
        print("%s %r" % (s.name, s.orthologs))

  def Compute_Evaluation_Score(self):
    """Compute self.score for this scenario.

    Reconstructs the graph once plus MAX_EVALUATING_ITERATION more times,
    keeps the smallest first component returned by the graph's
    Evaluation(), multiplies it by MULTIPLIER and subtracts the score of
    every selected tree.

    Raises Exception if Fill_Orthologs() has not populated species_list.
    """
    if self.species_list == []:
      raise Exception("Must fill orthologs before computing the score")
    best_graph = Reconstruct_Graph_Of_Life(self, [t.name
      for t in self.selected_trees])
    (best_score, _unused) = best_graph.Evaluation()
    for loop in range(MAX_EVALUATING_ITERATION):
      # A fresh name list is built for every call, as in the first call
      tmp_graph = Reconstruct_Graph_Of_Life(self, [t.name
          for t in self.selected_trees])
      (events, _unused) = tmp_graph.Evaluation()
      if events < best_score:
        best_score = events
    # NOTE: an earlier variant summed Pairwise_Similarity over all pairs
    # of selected trees here instead of using the reconstructed graph.
    self.score = best_score * MULTIPLIER
    for t in self.selected_trees:
      self.score -= t.score

  def Evaluation(self):
    """Return the score computed by Compute_Evaluation_Score()."""
    return self.score

  def Mutation(self):
    """Return a NEW Scenario; the original is left unchanged.

    Each selected tree is replaced, with probability 1/n, by a random tree
    of the same gene family and otherwise copied, where n is the length of
    the global gene_name_list.
    """
    mutated_scene = Scenario([])
    number_of_gene = len(gene_name_list)
    for t in self.selected_trees:
      coin_value = random.randint(0, number_of_gene-1)
      # Mutate it on a flip of 0, i.e. with probability 1/n
      if coin_value == 0:
        mutated_gene = Find_Name_Or_Die(gene_evidence_list, t.name)
        mutated_tree = random.choice(mutated_gene.trees).Copy()
        mutated_scene.selected_trees.append(mutated_tree)
      else:
        mutated_scene.selected_trees.append(t.Copy())
    return mutated_scene

# Represents current species
# name: the name of the species
# orthologs: all orthologs carried by this species
#
# Add_Ortholog(ortholog): append the ortholog to ortholog list
# Get_Subscript(gene_name): return the corresponding subscript of gene with
#   name gene_name. Raise error if not found

class Species:
  """A species (current or internal) together with the orthologs it carries.

  name:               the name of the species
  orthologs:          list of Ortholog objects carried by this species
  children:           list of child species (starts empty)
  edges_from_parents: list of incoming edges (starts empty)
  """
  def __init__(self, name):
    self.name = name
    self.orthologs = []
    self.children = []
    self.edges_from_parents = []
  def __repr__(self):
    # The verbose form is only used at the highest debug levels
    if not DEBUG_LEVEL > 3:
      return "%s " % self.name
    return "<Species %s: ortholog list = %r>\n" % (self.name, self.orthologs)
  def Add_Ortholog(self, ortholog1):
    """Append ortholog1 to the ortholog list."""
    self.orthologs.append(ortholog1)
  def Get_Subscript(self, gene_name):
    """Return the subscript of the ortholog named gene_name; the lookup
    goes through Find_Name_Or_Die."""
    return Find_Name_Or_Die(self.orthologs, gene_name).subscript
  def Copy(self):
    """Return a new Species with the same name and copied orthologs.
    children and edges_from_parents are NOT copied; they start empty."""
    duplicate = Species(self.name)
    for o in self.orthologs:
      duplicate.orthologs.append(o.Copy())
    return duplicate

# The variant of a gene in a species
# name: the name of the gene family corresponding to this ortholog
# subscript: the subscript
# exists: True if this ortholog originally in this species.
#   False if it was not but we later add it to the species to complete
#   the ortholog list. (critical for my algorithm)
#
# Subscript_Repr(): return subscript representation of an ortholog

class Ortholog:
  """The variant of one gene as carried by one species.

  name:      the name of the gene family
  subscript: string encoding the position in the gene tree
  exists:    True if the ortholog was found in the input; False if it was
             added afterwards to complete the ortholog list
  """
  def __init__(self, name, subscript, exists):
    self.name, self.subscript, self.exists = name, subscript, exists
  def __repr__(self):
    # Hypothetical (added) orthologs are marked with a leading "*"
    text = self.Subscript_Repr()
    if not self.exists:
      text = "*" + text
    return text
  def Subscript_Repr(self):
    """Return the "<gene name>_<subscript>" representation."""
    return "%s_%s" % (self.name, self.subscript)
  def Copy(self):
    """Return a new Ortholog with the same name, a sliced copy of the
    subscript and the same exists flag."""
    duplicate = Ortholog(self.name, self.subscript[:], self.exists)
    return duplicate

# Class definitions for the "Graph of Life":

# A directed edge in Graph of Life represents a parent-child relationship
# fromnode: the parent species
# tonode: the species which inherits some genes from the parent species
# carried_orthologs: names of the genes inherited along this edge
# full_edge: True if the edge carries all genuine orthologs of tonode
#            (direct inheritance, printed as a plain edge); False if it
#            carries only part of them (multiple inheritance, printed as a
#            dotted edge)
# fraction: the proportion of tonode's genuine orthologs contributed by
#           this parent (1.0 for a full edge)

class Edge:
  """A directed parent -> child edge in the Graph of Life.

  fromnode:          the parent species
  tonode:            the child species
  carried_orthologs: names of the genes inherited along this edge (the
                     list passed in is stored, not copied)
  full_edge:         True when every genuine (exists == True) ortholog of
                     tonode is carried by this edge
  fraction:          carried genuine orthologs / all genuine orthologs of
                     tonode
  """
  def __init__(self, parent_species, child_species, carried_orthologs):
    self.fromnode = parent_species
    self.tonode = child_species
    self.carried_orthologs = carried_orthologs
    # Count the carried genes that genuinely exist in the child
    partial_orthologs = 0
    for name in self.carried_orthologs:
      o = Find_Name(self.tonode.orthologs, name)
      if o.exists:
        partial_orthologs += 1
    genuine_orthologs = len([o for o in self.tonode.orthologs if o.exists])
    self.full_edge = (genuine_orthologs == partial_orthologs)
    if genuine_orthologs == 0:
      # A child without genuine orthologs used to trigger a division by
      # zero here. Such an edge is a full edge, so it contributes 1.0.
      self.fraction = 1.0
    else:
      self.fraction = float(partial_orthologs) / float(genuine_orthologs)
  def __repr__(self):
    # DOT syntax: plain edge for full inheritance, dotted edge labelled
    # with the fraction otherwise
    if self.full_edge:
      return "%s -> %s;\n" % (self.fromnode, self.tonode)
    else:
      return "%s -> %s [headlabel=%.3f, style=dotted];\n" \
              % (self.fromnode, self.tonode, self.fraction)

def Edge_Comparison(e1, e2):
  """cmp-style ordering of two edges; returns -1, 0 or 1.

  Full edges sort before partial edges. Two full edges are ordered by the
  integer that follows the first four characters of the parent name. Two
  partial edges are ordered by child name first, then by that same parent
  index.
  """
  first_full = bool(e1.full_edge)
  second_full = bool(e2.full_edge)
  if first_full and not second_full:
    return -1
  if second_full and not first_full:
    return 1
  if not first_full:
    # Both edges are partial: the child name is the primary key
    if e1.tonode.name > e2.tonode.name:
      return 1
    if e1.tonode.name < e2.tonode.name:
      return -1
  # Parent names are expected to look like "miss<N>"; compare N numerically
  index1 = int(e1.fromnode.name[4:])
  index2 = int(e2.fromnode.name[4:])
  return (index1 > index2) - (index1 < index2)

# The phylogenetic graph structure
# V: the set of vertices (in fact, Species objects in this application)
# E: the set of edges
#
# Add_Vertex(species): add species to V
# Add_Edge(edge): add edge to E, raise error if endpoints of edge not in V
#   Side Effect: might increase global counter miss_index
# Evaluation(): return the number of interbreeding events and interbreeded
#   species. The fewer, the better in our application

class Graph:
  """The phylogenetic graph.

  V: list of vertices (species objects)
  E: list of edges
  Both lists passed to the constructor are copied.
  """
  def __init__(self, V, E):
    self.V = V[:]
    self.E = E[:]

  def __repr__(self):
    # DOT language output: one indented line per edge
    parts = ['digraph G {\n  size = "7.5, 10" ;\n']
    for edge in self.E:
      parts.append("  %s" % (repr(edge),))
    parts.append("}\n")
    return "".join(parts)

  def Add_Vertex(self, species):
    """Append species to the vertex list."""
    self.V.append(species)

  def Add_Edge(self, edge):
    """Append edge to E and update the bookkeeping of its endpoints.

    The child is added to the parent's children list if not yet there. If
    the child already has an incoming edge from the same parent, the new
    edge's carried orthologs are appended to that existing edge; otherwise
    the new edge is recorded in the child's edges_from_parents.

    Raises ValueError (after printing V and both endpoints) if either
    endpoint is not in V.
    """
    if edge.fromnode not in self.V or edge.tonode not in self.V:
      print(self.V)
      print(edge.fromnode)
      print(edge.tonode)
      raise ValueError(
          "Can't add an edge while one of the endpoint not in vertex set")
    self.E.append(edge)
    if edge.tonode not in edge.fromnode.children:
      edge.fromnode.children.append(edge.tonode)
    found = False
    for e in edge.tonode.edges_from_parents:
      if edge.fromnode == e.fromnode:
        found = True
        # NOTE(review): this extends the existing edge's list in place
        # without recomputing its full_edge/fraction, and that list may be
        # shared with the caller -- confirm this is intended.
        e.carried_orthologs.extend(edge.carried_orthologs)
        break
    if not found:   # New parent
      edge.tonode.edges_from_parents.append(edge)

  def Add_Path(self, from_species, to_species, carried_orthologs):
    """Connect from_species to to_species, inserting internal species.

    For every carried gene, the difference in subscript length between the
    two species is counted as that many derivation steps. With fewer than
    two steps in total a single edge is added. Otherwise a chain of new
    internal species named "miss<N>" is created from to_species upwards;
    each one shortens the subscript of one randomly selected gene by one
    character, and the last one is linked to from_species.

    Side effect: increases the global counter miss_index once per internal
    species created.
    """
    global miss_index
    if DEBUG_LEVEL > 2:
      print("Add path from %s to %s" % (from_species.name, to_species.name))
      print("Ortholog list for ancestor: %r" % (from_species.orthologs,))
      print("Ortholog list for decendent: %r" % (to_species.orthologs,))
    derived_gene = []
    if DEBUG_LEVEL > 3:
      print("Carried Orthologs : %r" % (carried_orthologs,))
    for name in carried_orthologs:
      subscript1 = from_species.Get_Subscript(name)
      subscript2 = to_species.Get_Subscript(name)
      # One entry per character by which the descendant's subscript is
      # longer than the ancestor's
      for i in range(len(subscript2) - len(subscript1)):
        derived_gene.append(name)
    if len(derived_gene) < 2:
      self.Add_Edge(Edge(from_species, to_species, carried_orthologs))
      return
    # Direct parent of to_species: the ancestor's orthologs, with the
    # carried genes set to the descendant's subscripts ...
    internal_species = from_species.Copy()
    internal_species.name = "miss" + str(miss_index)
    miss_index += 1
    for name in carried_orthologs:
      o = Find_Name(internal_species.orthologs, name)
      o.subscript = to_species.Get_Subscript(name)
    if DEBUG_LEVEL > 3:
      print("Derived Gene : %r" % (derived_gene,))
    # ... then moved one step towards the ancestor in one random gene
    selected_gene = random.choice(derived_gene)
    derived_gene.remove(selected_gene)
    o = Find_Name(internal_species.orthologs, selected_gene)
    o.subscript = o.subscript[:-1]
    current_species = internal_species
    if DEBUG_LEVEL > 3:
      print("Add species %s %r" % (internal_species.name,
                                   internal_species.orthologs))
    self.Add_Vertex(internal_species)
    self.Add_Edge(Edge(current_species, to_species, carried_orthologs))
    gene_names = [o.name for o in from_species.orthologs]
    while len(derived_gene) > 1:
      selected_gene = random.choice(derived_gene)
      derived_gene.remove(selected_gene)
      # Generate direct parent to the current_species
      internal_species = current_species.Copy()
      internal_species.name = "miss" + str(miss_index)
      miss_index += 1
      # Get subscript one step near to the from_species
      o = Find_Name(internal_species.orthologs, selected_gene)
      o.subscript = o.subscript[:-1]
      if DEBUG_LEVEL > 3:
        print("Add species %s %r" % (internal_species.name,
                                     internal_species.orthologs))
      self.Add_Vertex(internal_species)
      self.Add_Edge(Edge(internal_species, current_species, gene_names))
      current_species = internal_species
    self.Add_Edge(Edge(from_species, current_species, gene_names))

  def Evaluation(self):
    """Return (events, species) computed over the partial edges in E.

    species is the sum of the fractions of all partial (non-full) edges;
    events is the number of partial edges minus that sum.
    """
    number_of_interbreeding_events = 0.0
    number_of_interbreeding_species = 0.0
    for e in self.E:
      if not e.full_edge:
        number_of_interbreeding_events += 1.0
        number_of_interbreeding_species += e.fraction
    number_of_interbreeding_events -= number_of_interbreeding_species
    return number_of_interbreeding_events, number_of_interbreeding_species

# Other class definitions:

# This queue did not remove popped object, the start index was incremented
# instead.
# list: the queue was implemented by list
# start: index of queue head
# end: index of queue end
# max_length: maximum length allowed
#
# Initialization: takes in two arguments list1, max_length. list1 was
#   some dummy objects that will never be popped, used only by GetAll function
# Put(item): push item into queue, increment the end index
# Get(): return the item at the queue head, increment the start index
# Length(): get the number of active elements in queue = end - start
# GetAll(): return a copy of ALL objects in queue, including both active
#   and inactive elements

class Queue:
  """A list-backed queue that never removes popped items.

  list:       backing list; starts as a copy of list1, whose items are
              never popped and only show up in GetAll()
  start:      index of the queue head
  end:        index one past the queue tail
  max_length: length of the backing list at which the queue counts as full
  """
  def __init__(self, list1, max_length):
    self.list = list1[:]
    self.start = len(list1)
    self.end = len(list1)
    self.max_length = max_length
  def __repr__(self):
    # Only the active part of the queue. This used to reference the
    # undefined names start/end and raised NameError.
    return "%r" % (self.list[self.start:self.end],)
  def Put(self, item):
    """Append item unless it is already in the backing list.

    Returns False if the append made the queue full, True otherwise
    (including when item was already present and nothing was added).
    """
    if item not in self.list:
      self.list.append(item)
      self.end += 1
      if self.end == self.max_length:
        return False            # Queue Full
    return True
  def Get(self):
    """Return the item at the head and advance the head index.

    Raises ValueError if there is no active item.
    """
    if self.start == self.end:
      raise ValueError("Tried to get item from empty queue!")
    item = self.list[self.start]
    self.start += 1
    return item
  def Length(self):
    """Return the number of active items (end - start)."""
    return self.end - self.start
  def GetAll(self):
    """Return a copy of ALL items ever stored, active or not."""
    return self.list[:]

# Basic routines

# Determine if there is an entry with the given name in a list
# Input: List l to be searched, objects in l must contain data member name.
#   The second argument would be a String object name to be found
# Output: The entry with that name in the list, or False if not found

def Find_Name(l, name):
  """Return the first entry of l whose name attribute equals name, or
  False when there is none."""
  result = False
  for candidate in l:
    if candidate.name == name:
      result = candidate
      break
  return result

# Same as Find_Name, but raises ValueError when not found

def Find_Name_Or_Die(l, name):
  """Return the entry of l found by Find_Name(l, name).

  Raises ValueError if Find_Name reports no match.
  """
  found = Find_Name(l, name)
  if not found:
    # %-formatting keeps the message intact for string names and no longer
    # fails with TypeError when name is not a string
    raise ValueError("Name %s not found in list" % (name,))
  return found

# Testing whether a string s is well parenthesized
# Input: String s
# Output: True if s is well parenthesized

def Is_Well_Parenthesized(s):
  """Return True if the parentheses in string s are balanced and never
  close before they open; all other characters are ignored."""
  open_count = 0
  for ch in s:
    if ch == ")":
      open_count -= 1
      if open_count < 0:
        # A ")" appeared with no matching "(" before it
        return False
    elif ch == "(":
      open_count += 1
  # Anything still open at the end means there are unmatched "("
  return open_count == 0

# Return a random index of element E in list L if E appears multiple times
def Random_Index_Of(E, L):
  """Return a randomly chosen index at which E occurs in list L, or -1 if
  E is not in L."""
  if E not in L:
    return -1
  positions = [i for i, item in enumerate(L) if item == E]
  pick = random.randint(0, len(positions)-1)
  return positions[pick]

# Check if A is a proper prefix of B, return False if not (including the
# case A=B)
def Is_Prefix(A, B):
  """Return True if string A is a proper prefix of string B, else False
  (including the case A == B)."""
  if len(A) >= len(B):
    return False
  # str.startswith only looks at the head of B. The former
  # string.find(B, A) == 0 searched all of B and relied on a function of
  # the string module that newer Python versions no longer provide.
  return B.startswith(A)

# Find the maximum common prefix of two string
def Common_Prefix(str1, str2):
  """Return the longest common prefix of str1 and str2."""
  matched = 0
  # zip stops at the shorter string, so no explicit length check is needed
  for left, right in zip(str1, str2):
    if not left == right:
      break
    matched += 1
  return str1[:matched]

# Return the intersection of two lists
def List_Intersection(list1, list2):
  """Return the items of list1 that also occur in list2, in list1's order
  (repeated items of list1 are kept)."""
  shared = []
  for item in list1:
    if item in list2:
      shared.append(item)
  return shared

# Return the union of two lists
def List_Union(list1, list2):
  """Return a new list: a copy of list1 followed by every item of list2
  that is not already in the result."""
  merged = list1[:]
  for item in list2:
    if item in merged:
      continue
    merged.append(item)
  return merged

# For three vertices in a tree, there are only 4 cases for their relationship:
# 0: if nearest common ancestor of (a,b)=(a,c)=(b,c)
# 1: if nearest common ancestor of (a,b) is a descendant of that of (a,c)=(b,c)
# 2: if nearest common ancestor of (b,c) is a descendant of that of (a,b)=(a,c)
# 3: if nearest common ancestor of (a,c) is a descendant of that of (a,b)=(b,c)

def Test_Relation(a, b, c):
  """Classify the relationship of three subscripts a, b and c.

  Compares the lengths of the pairwise common prefixes and returns
    0 if all three lengths are equal,
    3 if only (a,b) and (b,c) are equal,
    1 if only (b,c) and (a,c) are equal,
    2 if only (a,c) and (a,b) are equal.
  """
  ab = len(Common_Prefix(a, b))
  bc = len(Common_Prefix(b, c))
  ac = len(Common_Prefix(a, c))
  if ab == bc:
    if bc == ac:
      return 0
    return 3
  if bc == ac:
    return 1
  if ac == ab:
    return 2

def Count_Interbreeding_Events_Needed(species_list, gene_names):
  """Return Hitting_Set_Solver's result for the conflicting species triples.

  A triple of species is in conflict when two genes of gene_names give it
  different non-zero Test_Relation results. The triples are collected as
  lists of three indices into species_list and handed to
  Hitting_Set_Solver together with the range of all species indices.
  """
  total = len(species_list)
  conflict_triples = []
  for i in range(total):
    for j in range(i+1, total):
      for k in range(j+1, total):
        first = species_list[i]
        second = species_list[j]
        third = species_list[k]
        seen = 0      # First non-zero relationship found for this triple
        for gene_name in gene_names:
          current = Test_Relation(first.Get_Subscript(gene_name),
                                  second.Get_Subscript(gene_name),
                                  third.Get_Subscript(gene_name))
          if current == 0:
            continue
          if seen == 0:
            seen = current
          elif current != seen:
            # Two genes disagree: record the triple once and move on
            conflict_triples.append([species_list.index(first),
                                     species_list.index(second),
                                     species_list.index(third)])
            break
  return Hitting_Set_Solver(conflict_triples, range(total))

def Find_Common_Blocks(input_species_list, gene_names):
  """Collect the candidate blocks (species groups) over gene_names.

  Only species with at least one existing ortholog among gene_names are
  kept. For every pair of kept species a block is grown with a Queue: for
  each species taken from the queue and each gene, every species whose
  subscript shares a longer common prefix with either end than the two
  ends share with each other is added. Blocks that do not cover all kept
  species are stored once per distinct membership; the block of all kept
  species is appended last.

  Returns (potential_blocks, species_list, subscript_list), where
  subscript_list[k][n] is the subscript of kept species n for gene k.
  """
  species_list = []
  subscript_list = [[] for _gene in gene_names]
  for candidate in input_species_list:
    keep = False
    for g in gene_names:
      if Find_Name_Or_Die(candidate.orthologs, g).exists:
        keep = True
        break
    if not keep:
      continue
    species_list.append(candidate)
    for k in range(len(gene_names)):
      subscript_list[k].append(candidate.Get_Subscript(gene_names[k]))
  potential_blocks = []
  total = len(species_list)
  for i in range(total):
    for start in range(i+1, total):
      # species_list[i] is stored as an inactive entry so that it belongs
      # to the block without being processed itself
      q = Queue([species_list[i]], total)
      q.Put(species_list[start])
      while q.Length() > 0:
        j = species_list.index(q.Get())
        for row in subscript_list:
          anchor_i = row[i]
          anchor_j = row[j]
          threshold = len(Common_Prefix(anchor_i, anchor_j))
          for l in range(len(row)):
            closer = len(Common_Prefix(anchor_i, row[l])) > threshold or \
                     len(Common_Prefix(anchor_j, row[l])) > threshold
            if closer and not q.Put(species_list[l]):
              break   # Queue full, which means the block already included
                      # all species. Only this innermost loop is left.
      block = q.GetAll()
      if len(block) == total:
        continue
      repeated = False
      for known in potential_blocks:
        if len(known) == len(block) and \
           len(List_Intersection(known, block)) == len(known):
          repeated = True
          break
      if not repeated:
        potential_blocks.append(block)
  potential_blocks.append(species_list[:]) # Add trivial block: all species
  return potential_blocks, species_list, subscript_list

# Rather than the size, the minimum here means the inner-most term
# Example: (A ((B C) D) E) and (A (B (C D)) E)
#          potential_blocks = {AE, ABCD, BCD, BCDE, ABCDE}
#          BCD would be the min_block since it is either a
#          subset of other blocks or disjoint with them
# The only exception that no min-block is available is
# every non-trivial block has size 2, like
# [(A B) (B C) (C A) (A B C D E)], then we can take any
# non-trivial block as starting point. In this program,
# we take the first block by default, that is, set min_block = (A B)

def Find_Min_Block(blocks):
  """Return a copy of the first block that is nested in or disjoint from
  every other block; falls back to a copy of blocks[0] when there is none.

  Pairs of size-2 blocks are not compared against each other.
  """
  chosen = blocks[0]   # Default when no block qualifies
  for candidate in blocks:
    overlaps_partially = False
    for other in blocks:
      if len(candidate) == 2 and len(other) == 2:
        continue
      shared = len(List_Intersection(candidate, other))
      if shared != 0 and shared != len(candidate):
        overlaps_partially = True
        break
    if not overlaps_partially:
      chosen = candidate
      break
  # The caller may modify the result, so hand out a copy
  return chosen[:]

def Refine_Blocks(blocks, min_block):
  """Merge size-2 blocks that touch min_block into one enlarged block.

  Pass 1 walks the blocks once, in order: every size-2 block having exactly
  one member in the growing merged block contributes its other member.
  Pass 2 keeps every block that is not of size 2, plus the size-2 blocks
  whose first member is not in the merged block.  The merged block itself
  is appended at the end of the new block list.

  blocks    : list of blocks (lists); not modified
  min_block : starting block; not modified
  Returns (new_blocks, copy_of_merged_block).
  """
  merged = min_block[:]
  for pair in blocks:
    if len(pair) != 2:
      continue
    first, second = pair
    has_first = first in merged
    has_second = second in merged
    if has_first and not has_second:
      merged.append(second)
    elif has_second and not has_first:
      merged.append(first)
  kept = [b for b in blocks if len(b) != 2 or b[0] not in merged]
  kept.append(merged)
  return kept, merged[:]

def Construct_Parent_Species(child_list, gene_names):
  """Create the hypothetical parent species of the species in child_list.

  The parent is named "miss<miss_index>" (the global counter is read, not
  incremented, here).  For every gene name its subscript is the
  Common_Prefix of the children's subscripts, and its ortholog is flagged
  as existing when it exists in at least one child.

  child_list : list of Species; an empty list only triggers a printed
               message, after which execution continues as before
  gene_names : list of gene names to carry into the parent
  Returns the new Species.
  """
  global miss_index
  if len(child_list) == 0:
    print("Tried to construct parent species with null child list")
  parent_species = Species("miss" + str(miss_index))
  prefixes = []
  present = []
  # Seed both lists from the first child ...
  for name in gene_names:
    prefixes.append(child_list[0].Get_Subscript(name))
    present.append(Find_Name(child_list[0].orthologs, name).exists)
  # ... then fold in the remaining children, gene by gene.
  for index, name in enumerate(gene_names):
    for sibling in child_list[1:]:
      sibling_subscript = sibling.Get_Subscript(name)
      prefixes[index] = Common_Prefix(prefixes[index], sibling_subscript)
      if Find_Name(sibling.orthologs, name).exists:
        present[index] = True
    parent_species.Add_Ortholog(Ortholog(name,
                                         prefixes[index],
                                         present[index]))
  return parent_species

def Pairwise_Similarity(input_species_list, gene_names):
  """Score how well the gene trees named in gene_names agree.

  Repeatedly collapses the current min-block into a temporary combined
  species until a single species remains, summing the number of entries
  returned by Count_Interbreeding_Events_Needed for every min-block that
  is not of size 2.

  input_species_list : species list handed to Find_Common_Blocks
  gene_names         : names of the genes to compare
  Returns the square of the accumulated count.

  Fix over the previous version: species_list and every row of
  subscript_list are indexed in parallel, but only species_list used to be
  pruned when a block was collapsed.  From the second iteration on, the
  subscript read for species_list[i] therefore belonged to a different
  species.  Both structures are now pruned together.
  """
  # Find_Common_Blocks hands back its own working lists, which are
  # rebuilt and extended below.
  (potential_blocks, species_list, subscript_list) = \
    Find_Common_Blocks(input_species_list, gene_names)
  tmp_index = 0
  interbreeding_events_needed = 0
  while len(species_list) > 1:
    # First, we are going to determine the min_block.
    min_block = Find_Min_Block(potential_blocks)
    # This is the only special case that more than one block could join
    # together. For example: [s1 s2] [s1 s3] [s2 s3] => [s1 s2 s3]
    # Also in this case, no interbreeding event was needed
    if len(min_block) == 2:
      (potential_blocks, min_block) = Refine_Blocks(potential_blocks,
                                                    min_block)
    else:
      interbreeding_events_needed += len(
          Count_Interbreeding_Events_Needed(min_block, gene_names))
    tmp_species = Species("miss" + str(tmp_index))
    tmp_index += 1
    orthologs = [min_block[0].Get_Subscript(name) for name in gene_names]
    surviving = []   # positions of the species that stay in species_list
    for i in range(len(species_list)):
      if species_list[i] in min_block:
        for j in range(len(gene_names)):
          orthologs[j] = Common_Prefix(orthologs[j], subscript_list[j][i])
      else:
        surviving.append(i)
    for i in range(len(gene_names)):
      tmp_species.Add_Ortholog(Ortholog(gene_names[i], orthologs[i], True))
    # Replace all species in min_block by the new combined species, keeping
    # species_list and each subscript row aligned position by position.
    species_list = [species_list[i] for i in surviving]
    for j in range(len(subscript_list)):
      subscript_list[j] = [subscript_list[j][i] for i in surviving]
    species_list.append(tmp_species)
    for j in range(len(subscript_list)):
      subscript_list[j].append(orthologs[j])
    # Remove min_block, substitute it by new species in all other blocks
    potential_blocks.remove(min_block)
    for b in potential_blocks:
      try:
        if min_block[0] in b:
          for s in min_block:
            b.remove(s)
          b.append(tmp_species)
      except ValueError:
        # A block contained min_block[0] but not every member of min_block.
        print(potential_blocks)
        print(b)
        print(min_block)
        print(s)
        print(input_species_list)
        print(species_list)
        print(subscript_list)
        print("Min-block algorithm failed! (Pairwise Similarity)")
  return interbreeding_events_needed * interbreeding_events_needed

################
# Main Program #
################

# Hitting set problem solver

# Find a 3-approximation hitting set for C.
# C : Collection of size 3 subsets of element set K. This approach also works
#     for an arbitrary subset collection C, but may not yield approximation
#     ratio 3 in that general case.
# K : A set of elements without duplicate
# hitting_set : A hitting set, i.e., a set of elements such that for
#               each S in C, S intersect hitting_set != void

def Three_Approximation_Hitting(C, K):
  """Build a hitting set by taking whole subsets that are not yet hit.

  Walks C in order; whenever List_Intersection reports nothing in common
  between the current hitting set and a subset, every element of that
  subset is added.

  C : collection of subsets (lists of elements)
  K : element universe (not used by this routine)
  Returns the hitting set as a list.
  """
  hitting_set = []
  for subset in C:
    if List_Intersection(hitting_set, subset) != []:
      continue                # already hit by an earlier choice
    hitting_set.extend(subset)
  return hitting_set

# Find an O(nlgn)-approximation hitting set for C
# C : Collection of subsets of element set K
# K : A set of elements without duplicate
# Greedily find an element which covers most subsets
# not yet covered till all subsets are hit

def Greedy_Approximation_Hitting(C, K):
  """Greedily build a hitting set for the subset collection C.

  At each step an element with the highest count of not-yet-covered subsets
  is picked through Random_Index_Of(max(frequency), frequency); the subsets
  it hits are marked covered and the counts of their members are decreased.

  C : list of subsets (lists) of K
  K : list of distinct elements
  Returns the hitting set as a list.  When an element of some subset is not
  in K (or no index can be picked) a message is printed and whatever has
  been collected so far is returned.

  Fix over the previous version: hitting_set is bound before the try block.
  It used to be created only after the frequency count, so a ValueError
  raised while counting ended in an UnboundLocalError at the return.
  """
  hitting_set = []
  is_covered = [False for i in C]
  frequency = [0 for i in K]
  try:
    for subset in C:
      for element in subset:
        frequency[K.index(element)] += 1
    number_of_uncovered = len(C)
    while number_of_uncovered > 0:
      to_add = Random_Index_Of(max(frequency), frequency)
      if to_add == -1:
        raise ValueError
      chosen = K[to_add]
      hitting_set.append(chosen)
      for i in range(len(C)):
        if not is_covered[i] and chosen in C[i]:
          is_covered[i] = True
          number_of_uncovered = number_of_uncovered - 1
          # Members of a covered subset no longer help to cover it.
          for element in C[i]:
            frequency[K.index(element)] -= 1
  except ValueError:
    print("Bad hitting set problem. All elements in C must be a subset of K")
  return hitting_set

# A Monte Carlo heuristic to solve the general hitting set problem
# C : Collection of size 3 subsets of element set K
# K : A set of elements without duplicate

def Random_Hitting(C, K):
  """Return a randomly thinned hitting set for C.

  Starts from a copy of the whole element list K and lets Remove_Redundant
  discard elements from it in place.

  C : collection of subsets of K
  K : list of elements; left untouched
  """
  survivors = K[:]
  Remove_Redundant(survivors, C, K)
  return survivors

def Hitting_Set_Solver(C, K):
  """Return the smallest hitting set found by the three heuristics.

  Tries Three_Approximation_Hitting once, Greedy_Approximation_Hitting
  once, then Random_Hitting MAX_HITTING_SET_ITERATION times, and keeps the
  first strictly shortest result.

  C : list of subsets of K; an empty list yields []
  K : list of elements
  Exits the program with status 3 when C contains an empty subset.
  """
  global MAX_HITTING_SET_ITERATION
  if C == []:
    return []
  if [] in C:
    print("Error: no way to hit an empty set in hitting set problems")
    sys.exit(3)
  best = Three_Approximation_Hitting(C, K)
  attempt = Greedy_Approximation_Hitting(C, K)
  if len(attempt) < len(best):
    best = attempt[:]
  for _ in range(MAX_HITTING_SET_ITERATION):
    attempt = Random_Hitting(C, K)
    if len(attempt) < len(best):
      best = attempt[:]
  return best

# To obtain a minimal (not necessarily minimum) hitting set, discard elements
# until any further removal would result in a non-hitting set.
# Input: the starting set, subset collection C and the base element set K
# Output: Elements are deleted from hitting_set in place until no more
#         can be removed
def Remove_Redundant(hitting_set, C, K):
  """Drop elements from hitting_set, in place, while it still hits C.

  The elements of K are visited in a random order.  Each one that is
  currently in hitting_set is removed on trial and put back (at the end of
  the list) when Is_Hitting_Set reports that the remainder no longer hits
  every subset.

  hitting_set : list modified in place
  C           : collection of subsets
  K           : list of elements; a shuffled copy drives the visiting order
  Returns None.
  """
  trial_order = K[:]
  random.shuffle(trial_order)
  for element in trial_order:
    if element not in hitting_set:
      continue
    hitting_set.remove(element)
    if not Is_Hitting_Set(hitting_set, C, K):
      hitting_set.append(element)   # it was needed after all

# Test whether hitting_set is a hitting set of the subset collection C
def Is_Hitting_Set(hitting_set, C, K):
  """Tell whether every subset in C shares an element with hitting_set.

  hitting_set : collection of chosen elements
  C           : collection of subsets; an empty subset can never be hit
  K           : element universe (not used by this routine)
  Returns True or False.
  """
  for subset in C:
    for element in subset:
      if element in hitting_set:
        break                 # this subset is hit, move to the next one
    else:
      return False            # ran through the subset without a hit
  return True

# Functions in main program

# Function Decompress_Input:
# Read input lines and transform each line to a Gene_Tree object,
# then append it to proper Gene_Evidence object

def Decompress_Input(filename):
  """Read gene trees from filename into the module-level evidence lists.

  Each non-blank line holds a parenthesized tree, optionally followed by
  an annotation "[gene_name]" or "[gene_name;score]".  Lines without an
  annotation get the name "anonymous<k>" and score 0.  Every line becomes
  a Gene_Tree that is appended to the Gene_Evidence of its gene (created
  on first use); species_name_list and gene_name_list are extended with
  names not seen before.  SINGLE_TREE_CASE is set to False as soon as one
  gene receives a second tree.

  filename : path of the input file
  Exits with status 1 when the file cannot be read or a line is malformed.

  Changes over the previous version:
    * the input file is closed again, also on the error paths;
    * the ";" separating gene name and score is searched for after the
      "[" only, so a ";" that precedes the annotation (for instance the
      usual Newick terminator) no longer breaks the parse;
    * the delimiter pattern is compiled once instead of once per line.
  """
  global DELIMITER, REVERSE_SCORE, SINGLE_TREE_CASE
  anonymous_index = 0
  line = ""   # keeps the ValueError handler valid before any line is read
  try:
    input_file = open(filename, "r")
    try:
      delimiter_re = re.compile(r'[%s]' % DELIMITER)
      for line in input_file:
        if len(line) < 3:               # Blank line
          continue
        break_point1 = line.find("[")
        if break_point1 == -1:
          tree_part = line.strip()
          gene_name = "anonymous" + str(anonymous_index)
          anonymous_index += 1
          score = 0
        else:
          tree_part = line[0 : break_point1].strip()
          break_point2 = line.find(";", break_point1)
          break_point3 = line.index("]")
          if break_point2 == -1:
            gene_name = line[break_point1 + 1 : break_point3]
            score = 0
          else:
            gene_name = line[break_point1 + 1 : break_point2]
            score = int(line[break_point2 + 1 : break_point3]) * REVERSE_SCORE
        first_left_parenthesis = tree_part.index("(")
        if first_left_parenthesis != 0 or tree_part[-1] != ")":
          print ("Warning: There are extra characters before the first left\n"
                 "         parenthesis or after the last right parenthesis in\n"
                 "         input trees. These characters will be\n"
                 "         automatically removed.\n")
          last_right_parenthesis = tree_part.rindex(")")
          tree_part = tree_part[first_left_parenthesis :
                                last_right_parenthesis + 1]
        if not Is_Well_Parenthesized(tree_part):
          print("Input tree not well parenthesized!")
          raise ValueError
        # Split on the delimiter, then peel parentheses and blanks off
        # each piece to obtain the bare species names.
        for piece in delimiter_re.split(tree_part):
          species_name = piece.strip("() ")
          if len(species_name) == 0:     # Empty string
            continue
          if species_name not in species_name_list:
            species_name_list.append(species_name)
        tmp_tree = Gene_Tree(gene_name, tree_part, score)
        tmp_gene = Find_Name(gene_evidence_list, gene_name)
        if tmp_gene:
          SINGLE_TREE_CASE = False
          tmp_gene.trees.append(tmp_tree)
        else:
          tmp_gene = Gene_Evidence(gene_name)
          tmp_gene.trees.append(tmp_tree)
          gene_evidence_list.append(tmp_gene)
        if gene_name not in gene_name_list:
          gene_name_list.append(gene_name)
    finally:
      input_file.close()
  except IOError:
    print("Fail to open %s" % filename)
    sys.exit(1)
  except ValueError:
    print(line)
    print("Incorrect input format")
    sys.exit(1)
  return

# Randomly pick a "skeleton" tree; then, for each gene, select the tree
# most similar to both the skeleton and the last selected tree (to improve
# accuracy)

def Choose_Starting_Scene():
  """Pick one tree per gene to form the initial scenario.

  A random tree of a random gene serves as the skeleton.  For every other
  gene with several trees, the tree with the lowest combined
  Pairwise_Similarity score against the skeleton and against the
  previously chosen tree is selected (first one wins on ties); a gene with
  a single tree contributes that tree directly.

  Returns the list of chosen trees, skeleton first.

  Fix over the previous version: the per-gene candidate is reset for every
  gene and the first tree always seeds the comparison.  Previously, when
  no tree scored below the initial bound, the candidate left over from the
  previous gene was appended (or the name was unbound on the first gene).
  """
  top_trees = []
  random_gene = random.choice(gene_evidence_list)
  random_tree = random.choice(random_gene.trees)
  top_trees.append(random_tree)
  last_tree = random_tree
  for g in gene_evidence_list:
    if g == random_gene:
      continue
    if len(g.trees) == 1:
      top_trees.append(g.trees[0])
      last_tree = g.trees[0]
      continue
    tree_candidate = None
    min_score = len(species_name_list)*len(species_name_list)*3
    for t in g.trees:
      test_scene = Scenario([random_tree, t])
      test_scene.Fill_Orthologs()
      score = Pairwise_Similarity(test_scene.species_list,
                                  [random_tree.name, t.name])
      test_scene = Scenario([last_tree, t])
      test_scene.Fill_Orthologs()
      score = score + Pairwise_Similarity(test_scene.species_list,
                                          [last_tree.name, t.name])
      if tree_candidate is None or score < min_score:
        min_score = score
        tree_candidate = t
    if tree_candidate is None:
      # Gene without any tree: nothing to contribute.
      continue
    top_trees.append(tree_candidate)
    last_tree = tree_candidate
  return top_trees

# Given the gene evidence setting, pick the best combination of trees,
# one for each gene family, to get as few conflicts as possible
# Input: none
# Output: the best scenario explored through out the process

def Simulated_Annealing():
  """Search for a low-scoring scenario by simulated annealing.

  Starts from the scenario built out of Choose_Starting_Scene().  While the
  temperature (INITIAL_TEMPERATURE, multiplied by ALPHA after each step)
  stays above FINAL_TEMPERATURE, a mutated neighbor is evaluated; it
  replaces the current scenario when its score is lower, or otherwise with
  probability exp(improvement / temperature), where improvement is the
  score drop divided by MULTIPLIER.

  Returns the best scenario seen.  When SINGLE_TREE_CASE holds and
  MISSING_ORTHOLOG does not, the starting scenario is returned right away.
  """
  global DEBUG_LEVEL, INITIAL_TEMPERATURE, FINAL_TEMPERATURE
  global ALPHA, MULTIPLIER, SINGLE_TREE_CASE, MISSING_ORTHOLOG
  top_scene = Scenario(Choose_Starting_Scene())  # Initialize the best scenario
  top_scene.Fill_Orthologs()
  top_scene.Compute_Evaluation_Score()
  if SINGLE_TREE_CASE and not MISSING_ORTHOLOG:
    if DEBUG_LEVEL > 0:
      print("Single tree per gene case, no need to run annealing")
    return top_scene
  best_score = top_scene.Evaluation()
  if DEBUG_LEVEL > 1:
    print("Starting scenario :\n")
    print(top_scene)
  current_scene = top_scene.Copy()
  temperature = INITIAL_TEMPERATURE
  while temperature > FINAL_TEMPERATURE:
    if DEBUG_LEVEL > 0:
      print("******* ANNEALING *******")
    neighbor = current_scene.Mutation()
    neighbor.Fill_Orthologs()
    neighbor.Compute_Evaluation_Score()
    if DEBUG_LEVEL > 0:
      print("Current score: %d" % current_scene.Evaluation())
      print("Neighbor's score: %d" % neighbor.Evaluation())
    score_drop = current_scene.Evaluation() - neighbor.Evaluation()
    improvement = float(score_drop) / float(MULTIPLIER)
    if improvement > 0.0:
      # A strictly better neighbor is always accepted.
      accept = True
      if neighbor.Evaluation() < best_score:
        if DEBUG_LEVEL > 0:
          print("Switch to Neighbor!")
        best_score = neighbor.Evaluation()
        top_scene = neighbor.Copy()
    else:
      # Otherwise accept with a probability that shrinks as it cools.
      accept = random.random() < math.exp(improvement / temperature)
      if accept and DEBUG_LEVEL > 0:
        print("Switch to Neighbor!")
    if accept:
      current_scene = neighbor
    temperature = temperature * ALPHA # Decreasing the temperature
  return top_scene

def _Score_Tree_Selection(hitting_set, all_trees):
  """Build and evaluate the Scenario made of the trees picked by hitting_set.

  hitting_set : indices into all_trees
  all_trees   : list of distinct tree descriptions
  Returns (scene, score) with score = float(scene.Evaluation()).
  """
  selected = []
  for tmp_index, tree_index in enumerate(hitting_set):
    selected.append(Gene_Tree("tmp" + str(tmp_index),
                              all_trees[tree_index], 0))
    if DEBUG_LEVEL > 1:
      print(all_trees[tree_index])
  scene = Scenario(selected)
  scene.Fill_Orthologs()
  scene.Compute_Evaluation_Score()
  return scene, float(scene.Evaluation())

def _Restart_Tree_Selection(solver, cover_list, filtered_list, all_trees,
                            top_scene, best_score):
  """Run solver MAX_RESTART_ITERATION times, keeping the best scenario.

  solver : hitting-set routine called as solver(cover_list, filtered_list)
  Returns the possibly improved (top_scene, best_score).
  """
  for loop in range(MAX_RESTART_ITERATION):
    hitting_set = solver(cover_list, filtered_list)
    new_scene, new_score = _Score_Tree_Selection(hitting_set, all_trees)
    if DEBUG_LEVEL > 0:
      print("***Iteration %d ***" % (loop + 1))
      print("Number of tree selected: %d" % len(hitting_set))
      print("New scene score: %f" % new_score)
    if new_score < best_score:
      best_score = new_score
      top_scene = new_scene.Copy()
  return top_scene, best_score

def Consensus_Method(start_time):
  """Select a small set of trees that covers every gene, by consensus.

  Distinct trees are counted over all gene evidence; trees whose count does
  not exceed FREQUENCY_THRESHOLD are dropped, and so are genes left without
  any tree.  Choosing trees so that every remaining gene keeps at least one
  of its trees is a hitting set problem; it is attacked with one greedy
  run, MAX_RESTART_ITERATION further greedy runs and MAX_RESTART_ITERATION
  random runs, and the scenario with the lowest evaluation is kept.

  start_time : not used by this routine
  Returns (top_scene, gene_names) where gene_names are "tmp0", "tmp1", ...
  one per selected tree of top_scene.

  The three formerly copy-pasted "build scenario and score it" sections now
  share the private helpers above; behaviour is unchanged.
  """
  all_trees = []        # distinct tree descriptions
  tree_frequency = []   # tree_frequency[i] = occurrences of all_trees[i]
  cover_list = []       # per gene: indices of the trees proposed for it
  for evidence in gene_evidence_list:
    cover = []
    for t in evidence.trees:
      found = False
      for i in range(len(all_trees)):
        if t.tree == all_trees[i]:
          tree_frequency[i] += 1
          cover.append(i)
          found = True
      if not found:
        cover.append(len(tree_frequency))
        all_trees.append(t.tree)
        tree_frequency.append(1)
    cover_list.append(cover)
  filtered_list = [i for i in range(len(tree_frequency))
                     if tree_frequency[i] > FREQUENCY_THRESHOLD]
  if DEBUG_LEVEL > 0:
    print("Number of different trees in original data : %d" % len(all_trees))
    print("Number of trees dropped by frequency threshold : %d" %
          (len(all_trees) - len(filtered_list)))
  for i in range(len(cover_list)):
    cover_list[i] = List_Intersection(filtered_list, cover_list[i])
  # Ignore genes with eccentric trees
  if DEBUG_LEVEL > 0:
    print("Number of genes initially : %d" % len(cover_list))
  cover_list = [c for c in cover_list if c != []]
  if DEBUG_LEVEL > 0:
    print("Number of gene with at least one tree left : %d" % len(cover_list))
  # Initialize the best scenario with a first greedy run.
  hitting_set = Greedy_Approximation_Hitting(cover_list, filtered_list)
  top_scene, best_score = _Score_Tree_Selection(hitting_set, all_trees)
  if DEBUG_LEVEL > 0:
    print("Running greedy algorithm:")
    print("Number of tree selected: %d" % len(hitting_set))
    print("New scene score: %f" % best_score)
  top_scene, best_score = _Restart_Tree_Selection(
      Greedy_Approximation_Hitting, cover_list, filtered_list, all_trees,
      top_scene, best_score)
  if DEBUG_LEVEL > 0:
    print("Running randomized algorithm:")
  top_scene, best_score = _Restart_Tree_Selection(
      Random_Hitting, cover_list, filtered_list, all_trees,
      top_scene, best_score)
  gene_names = ["tmp" + str(i) for i in range(len(top_scene.selected_trees))]
  return top_scene, gene_names

def Build_Subgraph(graph, ancestor_species, input_species_list, gene_names):
  """Add to graph the sub-graph that joins the species of input_species_list.

  Repeatedly takes the current min-block, creates its parent species with
  Construct_Parent_Species, adds that parent to graph with a path to every
  member of the block, and replaces the block by the parent, until a
  single species is left.

  graph              : Graph extended in place
  ancestor_species   : species expected at the top of the sub-graph
  input_species_list : species to join
  gene_names         : gene names carried along the added paths

  Returns the last remaining species when its subscripts equal those of
  ancestor_species for every gene name.  Otherwise ancestor_species is
  added to graph with a path to the remaining species and False is
  returned.  The global miss_index is incremented once per parent created.
  """
  global miss_index
  (potential_blocks, species_list, subscript_list) = \
      Find_Common_Blocks(input_species_list, gene_names)
  while len(species_list) > 1:
    min_block = Find_Min_Block(potential_blocks)
    # Size-2 blocks may be merged with neighbouring size-2 blocks first.
    if len(min_block) == 2:
      (potential_blocks, min_block) = Refine_Blocks(potential_blocks,
                                                    min_block)
    parent_species = Construct_Parent_Species(min_block, gene_names)
    # Construct_Parent_Species only reads miss_index; advance it here.
    miss_index += 1
    graph.Add_Vertex(parent_species)
    for child in min_block:
      graph.Add_Path(parent_species, child, gene_names)

    # Replace all species in min_block by new combined species
    species_list = [s for s in species_list if s not in min_block]
#    deleted = 0
#    for i in range(len(species_list)):
#      if species_list[i - deleted] in min_block:
#        species_list[i-deleted : i-deleted+1] = []
#        for subscript in subscript_list:
#          subscript[i-deleted : i-deleted+1] = []
#        deleted += 1
    species_list.append(parent_species)
    # NOTE(review): subscript_list is extended here but is not read again
    # in this function, and is not pruned alongside species_list.
    for i in range(len(subscript_list)):
      subscript_list[i].append(parent_species.orthologs[i].subscript)
    # Remove min_block, substitute it by new species in all other blocks
    potential_blocks.remove(min_block)
    for b in potential_blocks:
      try:
        if min_block[0] in b:
          for s in min_block:
            b.remove(s)
          b.append(parent_species)
      except ValueError:
        # b held min_block[0] but not every member of min_block; dump the
        # state and carry on.
        print potential_blocks
        print b
        print min_block
        print s
        print "Min-block algorithm failed! (Build Subgraph)"
  # The survivor stands in for ancestor_species only if every gene
  # subscript matches.
  is_ancestor = True
  for name in gene_names:
    if species_list[0].Get_Subscript(name) !=  \
       ancestor_species.Get_Subscript(name):
      is_ancestor = False
  if is_ancestor:
    return species_list[0]
  else:
    graph.Add_Vertex(ancestor_species)
    graph.Add_Path(ancestor_species, species_list[0], gene_names)
  return False

def Add_Back_Species(species, graph, possible_parents, gene_names):
  """Attach a previously removed species to graph under suitable parents.

  Add back species causing conflicts, which is another hitting set
  problem: for every gene whose ortholog exists in species, the candidate
  parents are the members of possible_parents whose subscript equals the
  longest prefix of the species' subscript that any of them carries.
  Hitting_Set_Solver then picks parents so that each such gene is served,
  and one path per picked parent is added to graph.

  species          : species to attach
  graph            : Graph extended in place (candidates are referred to
                     by their position in graph.V)
  possible_parents : species allowed to act as parents
  gene_names       : gene names to consider
  Returns None.

  Changes over the previous version:
    * the prefix search stops once the prefix is empty.  It used to loop
      forever when no possible parent matched even the empty prefix; the
      gene now contributes an empty candidate set, which
      Hitting_Set_Solver reports before exiting;
    * the unused been_covered list is gone.
  """
  set_collection = []
  parents_ortholog_list = []
  if DEBUG_LEVEL > 2:
    print("Current species list : ")
    for s in graph.V:
      print("%s %r" % (s.name, s.orthologs))
    print("Interbreeded species : %s %r" % (species.name, species.orthologs))
  for name in gene_names:
    ortholog = Find_Name(species.orthologs, name)
    if not ortholog.exists:
      continue
    # We start by looking if this ortholog already exists,
    # this setting would prevent same ortholog arose twice
    prefix = ortholog.subscript[:]
    while True:
      # We assume all species were evolved from extinct species, i.e.,
      # internal vertices in the graph. The caller decides, through
      # possible_parents, which vertices may serve as parents.
      possible_parent_set = [graph.V.index(p) for p in possible_parents
                             if p.Get_Subscript(name) == prefix]
      if possible_parent_set or len(prefix) == 0:
        break
      prefix = prefix[:-1]    # retry with a shorter prefix
    set_collection.append(possible_parent_set)
    parents_ortholog_list.append([name, prefix, False])
  if DEBUG_LEVEL > 1:
    print("Orthologs hitting set : %r" % set_collection)
  parents = Hitting_Set_Solver(set_collection, range(len(graph.V)))
  if DEBUG_LEVEL > 1:
    print("Parents for species added back : %r" % parents)
  if len(parents) == 1:  # No interbreeding event
    cover_list = [entry[0] for entry in parents_ortholog_list]
    graph.Add_Path(graph.V[parents[0]], species, cover_list)
    return

  # Several parents: each gene is carried by the first chosen parent whose
  # subscript equals the prefix recorded for that gene.
  for parent_index in parents:
    partial_parent = graph.V[parent_index]
    cover_list = []
    for entry in parents_ortholog_list:
      name, subscript, is_covered = entry
      if is_covered:
        continue
      if partial_parent.Get_Subscript(name) == subscript:
        entry[2] = True    # Change is_covered to True
        cover_list.append(name)
    graph.Add_Path(partial_parent, species, cover_list)

def Refine_Graph(graph):
  """Contract every vertex of graph that has one parent and one child.

  Such a vertex is bypassed: its incoming and outgoing edges are replaced
  by a single edge from its parent to its child, carrying a copy of the
  orthologs of the outgoing edge.  graph.V and graph.E are updated in
  place; the bypassed vertices are dropped from graph.V at the end.
  """
  remove_vertices = []
  for v in graph.V:
    if len(v.children) == 1 and len(v.edges_from_parents) == 1:
      child = v.children[0]
      parent = v.edges_from_parents[0].fromnode
      delete_edge1 = v.edges_from_parents[0]   # edge parent -> v
      # Locate the edge v -> child among the child's incoming edges.
      # NOTE(review): if no such edge exists, delete_edge2 keeps its value
      # from an earlier iteration (or is unbound on the first one).
      for e in child.edges_from_parents:
        if e.fromnode == v:
          delete_edge2 = e
          break
      carried_orthologs = delete_edge2.carried_orthologs[:]
      new_edge = Edge(parent, child, carried_orthologs)
      # Rewire parent and child around v.
      parent.children.remove(v)
      parent.children.append(child)
      child.edges_from_parents.remove(delete_edge2)
      child.edges_from_parents.append(new_edge)
      remove_vertices.append(v)
      graph.E.remove(delete_edge1)
      graph.E.remove(delete_edge2)
      graph.E.append(new_edge)
  # graph.V is rebuilt only after the loop, so it is not modified while
  # being iterated.
  graph.V = [v for v in graph.V if v not in remove_vertices]

def Merge_Interbreeding_Events(graph):
  """Merge pairs of parents of a vertex when one parent is the other's child.

  For a vertex v with two incoming edges e1 (from v1) and e2 (from v2)
  where v2 is a child of v1, a new species new_v is created as a copy of
  v1, renamed "miss<miss_index>", taking v2's subscripts for the orthologs
  carried by e2.  e1 and e2 are replaced by a single edge new_v -> v that
  carries the orthologs of both, and v1 / v2 are rewired or removed as
  detailed below.  The scan restarts from scratch after every merge and
  ends once a full pass changes nothing.

  graph : Graph modified in place
  Exits with status 4 when v2 does not have exactly one parent edge.
  Returns None.
  """
  global miss_index
  changed = True
  while changed:
    changed = False
    for v in graph.V:
      for e1 in v.edges_from_parents:
        for e2 in v.edges_from_parents:
          if e1 == e2:
            continue
          v1 = e1.fromnode
          v2 = e2.fromnode
          if v2 in v1.children:
            changed = True
            # Build the merged parent from v1, overriding the orthologs
            # that reached v through v2.
            new_v = v1.Copy()
            new_v.name = "miss" + str(miss_index)
            miss_index += 1
            for name in e2.carried_orthologs:
              new_ortholog = Find_Name(new_v.orthologs, name)
              new_subscript = v2.Get_Subscript(name)
              new_ortholog.subscript = new_subscript[:]
            new_carried_orthologs = e1.carried_orthologs[:]
            new_carried_orthologs.extend(e2.carried_orthologs)
            new_e = Edge(new_v, v, new_carried_orthologs)
            new_v.children.append(v)
            graph.V.append(new_v)
            # Swap the two old edges into v for the single merged edge.
            graph.E.remove(e1)
            graph.E.remove(e2)
            graph.E.append(new_e)
            v.edges_from_parents.remove(e1)
            v.edges_from_parents.remove(e2)
            v.edges_from_parents.append(new_e)
            v1.children.remove(v)
            v2.children.remove(v)
            if len(v2.edges_from_parents) != 1:
              print "Direct descendent has more than one parent"
              sys.exit(4)
            # Detach v2's only incoming edge from the graph's edge list;
            # it may be re-added further down.
            if v2.edges_from_parents[0] in graph.E:
              graph.E.remove(v2.edges_from_parents[0])
            if len(v1.children) == 1 and len(v1.edges_from_parents) == 1:
              # v1 is left with one parent and one child: new_v takes its
              # place under v1's parent and v1 leaves the graph.
              parent = v1.edges_from_parents[0].fromnode
              v1.edges_from_parents[0].tonode = new_v
              new_v.edges_from_parents.append(v1.edges_from_parents[0])
              v1.edges_from_parents = []
              graph.V.remove(v1)
              parent.children.remove(v1)
              parent.children.append(new_v)
            else:
              # v1 stays; new_v takes v2's place among v1's children.
              v1.children.remove(v2)
              v1.children.append(new_v)
              v2.edges_from_parents[0].tonode = new_v
              parent_edge = Edge(v1, new_v,
                  v2.edges_from_parents[0].carried_orthologs[:])
              new_v.edges_from_parents.append(parent_edge)
              graph.E.append(parent_edge)
            if len(v2.children) == 1:
              # v2 has a single child left: hand it over to new_v and
              # drop v2 from the graph.
              # NOTE(review): v was already removed from v2.children above,
              # so the first branch looks unreachable, and with exactly one
              # child the index-1 access would raise IndexError -- confirm.
              if v2.children[0] == v:
                child = v2.children[1]
              else:
                child = v2.children[0]
              new_v.children.append(child)
              for e in child.edges_from_parents:
                if e.fromnode == v2:
                  e.fromnode = new_v
                  break
              graph.V.remove(v2)
            else:
              # v2 stays in the graph, now as a child of new_v.
              new_v.children.append(v2)
              v2.edges_from_parents[0].fromnode = new_v
              graph.E.append(v2.edges_from_parents[0])
            break
        # The edge lists were modified, so leave all loops and rescan.
        if changed:
          break
      if changed:
        break
  return

def Test_Merge():
  """Manual check of Merge_Interbreeding_Events on a seven-species graph.

  Builds species s1..s7 with orthologs for genes "g" and "h", connects
  them with seven edges (s5 has two parents, s2 and s3), and prints every
  vertex with its parents and children before and after the merge.
  """
  # (species name, subscript of gene "g", subscript of gene "h")
  species_table = [
    ("s1", "", ""),
    ("s2", "2", "1"),
    ("s3", "22", "12"),
    ("s4", "221", "122"),
    ("s5", "222", "11"),
    ("s6", "23", "13"),
    ("s7", "223", "123"),
  ]
  # (position of parent, position of child, carried genes), 0-based
  edge_table = [
    (0, 1, ["g", "h"]),
    (1, 2, ["g", "h"]),
    (2, 3, ["g", "h"]),
    (1, 4, ["h"]),
    (2, 4, ["g"]),
    (1, 5, ["g", "h"]),
    (2, 6, ["g", "h"]),
  ]

  def Dump(graph):
    for v in graph.V:
      parents = [e.fromnode for e in v.edges_from_parents]
      print("Name: %s Parents: %r Children: %r" % (v.name, parents,
                                                   v.children))

  nodes = [Species(row[0]) for row in species_table]
  for node, row in zip(nodes, species_table):
    node.Add_Ortholog(Ortholog("g", row[1], True))
    node.Add_Ortholog(Ortholog("h", row[2], True))
  edges = [Edge(nodes[src], nodes[dst], genes)
           for (src, dst, genes) in edge_table]
  g1 = Graph(nodes, [])
  for edge in edges:
    g1.Add_Edge(edge)
  Dump(g1)
  Merge_Interbreeding_Events(g1)
  Dump(g1)

def Reconstruct_Graph_Of_Life(scene, gene_names):
  """Build the combined graph for the species of scene.

  Works bottom-up on min-blocks: for each block the species reported by
  Count_Interbreeding_Events_Needed are set aside, a sub-graph is built
  over the remaining ones with Build_Subgraph, and the species set aside
  are attached again with Add_Back_Species.  The block is then replaced by
  its ancestor species, until one species is left.  Finally the graph goes
  through Refine_Graph and Merge_Interbreeding_Events.

  scene      : scenario whose species_list provides the initial vertices
  gene_names : names of the genes to combine
  Returns the resulting Graph.  Resets the global miss_index to 0 first.
  """
  global miss_index
  miss_index = 0
  graph_of_life = Graph(scene.species_list,[])
  (potential_blocks, species_list, subscript_list) = \
      Find_Common_Blocks(scene.species_list, gene_names)
  add_back_species = []   # NOTE(review): not used further in this function
  while len(species_list) > 1:
    # First, we are going to determine the min_block.
    min_block = Find_Min_Block(potential_blocks)
    # This is the only special case that more than one block could join
    # together. For example: [s1 s2] [s1 s3] [s2 s3] => [s1 s2 s3]
    # Also in this case, no interbreeding event was needed
    if len(min_block) == 2:
      (potential_blocks,min_block) = Refine_Blocks(potential_blocks,min_block)
      remove_index = []
    # Remove species causing conflicts
    else:
      # remove_index holds positions within min_block.
      remove_index = Count_Interbreeding_Events_Needed(min_block,
                                                       gene_names)
    if DEBUG_LEVEL > 1:
      print "Remove species : %r" % [min_block[i]
                                     for i in range(len(min_block))
                                     if i in remove_index]
    # Species created from here on are named "miss<start_index>" onwards.
    start_index = miss_index
    ancestor_species = Construct_Parent_Species(min_block, gene_names)
    miss_index += 1
    subtree_species = [min_block[i] for i in range(len(min_block))
                                    if i not in remove_index]
    if DEBUG_LEVEL > 1:
      print "Subtree Species : %r" % subtree_species
    ancestor_change = Build_Subgraph(graph_of_life, ancestor_species,
                                     subtree_species, gene_names)
    # Build_Subgraph returns a species when its own top vertex replaces
    # ancestor_species, which was then not added to the graph; skip the
    # index reserved for it.
    if ancestor_change:
      ancestor_species = ancestor_change
      start_index += 1
    if DEBUG_LEVEL > 1:
      print "Ancestor Species : " + ancestor_species.name
    end_index = miss_index
    # Candidate parents: the "miss" species created for this block plus
    # the species kept in the subtree.
    possible_parents = []
    for i in range(start_index, end_index):
      possible_parents.append(Find_Name(graph_of_life.V,
                                        "miss" + str(i)))
    possible_parents.extend(subtree_species)
    if DEBUG_LEVEL > 1:
      print "Possible parents : %r" % possible_parents
    for i in range(len(min_block)):
      if i in remove_index:
        if DEBUG_LEVEL > 1:
          print "Species to be added : " + min_block[i].name
        start_index = end_index
        Add_Back_Species(min_block[i],
                         graph_of_life,
                         possible_parents,
                         gene_names)
        end_index = miss_index
        # NOTE(review): this inner loop reuses the name i of the enclosing
        # loop; the enclosing for statement rebinds i on its next
        # iteration, and i is not read again before that.
        for i in range(start_index, end_index):
          possible_parents.append(Find_Name_Or_Die(graph_of_life.V,
                                                   "miss" + str(i)))
        if DEBUG_LEVEL > 2:
          print graph_of_life
    # Replace all species in min_block by new combined species
    species_list = [s for s in species_list if s not in min_block]
#    deleted = 0
#    for i in range(len(species_list)):
#      if species_list[i - deleted] in min_block:
#        species_list[i-deleted : i-deleted+1] = []
#        for subscript in subscript_list:
#          subscript[i-deleted : i-deleted+1] = []
#        deleted += 1
    species_list.append(ancestor_species)
    # NOTE(review): subscript_list is extended here but is not read again
    # in this function, and is not pruned alongside species_list.
    for i in range(len(subscript_list)):
      subscript_list[i].append(
          ancestor_species.Get_Subscript(gene_names[i]))

    # Remove min_block, substitute it by new species in all other blocks
    potential_blocks.remove(min_block)
    for b in potential_blocks:
      try:
        if min_block[0] in b:
          for s in min_block:
            b.remove(s)
          b.append(ancestor_species)
      except ValueError:
        # b held min_block[0] but not every member of min_block; dump the
        # state and carry on.
        print potential_blocks
        print b
        print min_block
        print s
        print "Min-block algorithm failed! (Reconstruct Graph of Life)"
    if DEBUG_LEVEL > 2:
      print graph_of_life
  Refine_Graph(graph_of_life)
  Merge_Interbreeding_Events(graph_of_life)
  return graph_of_life

# There are two things to do in Refine_Graph. The first one is contracting
# vertices having exactly one parent and one child. In this case, we
# delete this vertex and add an edge from its parent to its child directly.
# The second thing is renumbering the miss-index from the root.

def Graph_Renumbering(graph):
  """Rename the internal vertices of graph "miss0", "miss1", ... from the root.

  The root is the first vertex of graph.V without incoming edges.  Starting
  there, children that themselves have children are pushed on a Queue; the
  vertices returned by the queue's GetAll() are renamed in that order.
  Finally graph.E is sorted with Edge_Comparison.

  graph : Graph modified in place
  Returns None.
  """
  for candidate in graph.V:
    if len(candidate.edges_from_parents) == 0:
      root = candidate
      break
  pending = Queue([], len(graph.V))
  pending.Put(root)
  while pending.Length() > 0:
    current = pending.Get()
    for child in current.children:
      if len(child.children) > 0:     # leaves keep their own names
        pending.Put(child)
  internal_nodes = pending.GetAll()
  if DEBUG_LEVEL > 2:
    print("Internal Nodes: %r" % internal_nodes)
  for new_miss_index, node in enumerate(internal_nodes):
    node.name = "miss" + str(new_miss_index)
  graph.E.sort(Edge_Comparison)

# List_Command_Line_Flags prints the usage summary for every command-line
# option to standard output. Options whose long name ends in '=' take a
# value; the others are plain flags. The long names shown here are the ones
# the option parser in main() is meant to accept.
#
# Returns:
#   None.
def List_Command_Line_Flags():
  print("Options:\n"
    "(-a) --annealing: use simulated annealing method\n"
    "(-f) --filename= input file name\n"
    "(-o) --outputfile= output file name. Default is the input file name"
    " followed by '.dot'\n"
    "(-w) --delimiter= set delimiter in Newick tree format. Default is"
    " whitespace\n"
    "(-d) --debug= debug level\n"
    "(-t) --max_selecting_time= maximum seconds allow to select"
    " the trees\n"
    "                           Negative value for no limit\n"
    "(-u) --max_combining_time= maximum seconds allow to combine"
    " the trees\n"
    "                           Negative value for no limit\n"
    "(-n) --restart_iteration= times of restarting simulated annealing\n"
    "                          or consensus method\n"
    "(-s) --hitting_iteration= number of iterations used for hitting set"
    " solver\n"
    "(-e) --evaluating_iteration= number of iterations used for graph"
    " evaluation\n"
    "(-g) --graph_iteration= times of generating combined graph given"
    " a set of trees\n"
    "(-q) --frequency_threshold= minimum frequency of trees used in"
    " consensus method\n"
    "(-i) --initial_temperature= for simulated annealing\n"
    "(-z) --final_temperature= for simulated annealing\n"
    "(-p) --alpha= for simulated annealing\n"
    "(-m) --multiplier= should be greater than the maximum score in\n"
    "                   gene trees times the number of genes in general\n"
    "(-r) --reverse_score: if present, higher score means better confidence\n"
    "(-h) --help: print this menu\n")
  return

# main is the command-line driver. It
#   1. parses the options in sys.argv and stores them in the module-level
#      configuration globals,
#   2. loads the input with Decompress_Input(filename),
#   3. picks a scene with Consensus_Method, or with repeated
#      Simulated_Annealing runs when -a/--annealing is given (a lower
#      Evaluation() score replaces the current best),
#   4. rebuilds the graph with Reconstruct_Graph_Of_Life up to
#      MAX_GRAPH_ITERATION more times, keeping the lowest-scoring graph,
#   5. renumbers the kept graph, prints a summary and writes the graph's
#      repr to the output file.
#
# Returns:
#   None.
# Exits:
#   Calls sys.exit(2) when the option list is invalid or when a numeric
#   option is given a value that cannot be converted.
def main():
  global ANNEALING_METHOD, DELIMITER, DEBUG_LEVEL
  global MAX_SELECTING_TIME, MAX_COMBINING_TIME
  global MAX_RESTART_ITERATION, MAX_HITTING_SET_ITERATION
  global MAX_EVALUATION_ITERATION, MAX_GRAPH_ITERATION
  global FREQUENCY_THRESHOLD, INITIAL_TEMPERATURE, FINAL_TEMPERATURE, ALPHA
  global MULTIPLIER, REVERSE_SCORE
  global miss_index, SINGLE_TREE_CASE, MISSING_ORTHOLOG
  try:
    # Every long option that takes a value needs the trailing '=';
    # otherwise getopt rejects "--option=value".
    options, args = getopt.getopt(sys.argv[1:],
        "af:o:w:d:t:u:n:s:e:g:q:i:z:p:m:rh",
        ["annealing", "filename=", "outputfile=", "delimiter=", "debug=",
         "max_selecting_time=", "max_combining_time=",
         "restart_iteration=", "hitting_iteration=",
         "evaluating_iteration=", "graph_iteration=",
         "frequency_threshold=", "initial_temperature=",
         "final_temperature=", "alpha=", "multiplier=", "reverse_score",
         "help"])
  except getopt.GetoptError:
    print('Invalid option list')
    print('Add --help for correct usage')
    sys.exit(2)
  filename = 'data'
  outputfile = 'data.dot'
  try:
    for option, arg in options:
      if option in ("-h", "--help"):
        List_Command_Line_Flags()
        return
      elif option in ("-f", "--filename"):
        filename = arg
      elif option in ("-o", "--outputfile"):
        outputfile = arg
      elif option in ("-a", "--annealing"):
        ANNEALING_METHOD = True
      elif option in ("-w", "--delimiter"):
        # These characters carry structure in the Newick strings, so they
        # cannot double as the separator.
        forbid = "()[]:"
        for c in forbid:
          if c in arg:
            print('%c is not allowed as a delimiter' % c)
            return
        DELIMITER = arg
      elif option in ("-d", "--debug"):
        DEBUG_LEVEL = int(arg)
      elif option in ("-t", "--max_selecting_time"):
        MAX_SELECTING_TIME = float(arg)
      elif option in ("-u", "--max_combining_time"):
        MAX_COMBINING_TIME = float(arg)
      elif option in ("-n", "--restart_iteration"):
        MAX_RESTART_ITERATION = int(arg)
      elif option in ("-s", "--hitting_iteration"):
        MAX_HITTING_SET_ITERATION = int(arg)
      elif option in ("-e", "--evaluating_iteration"):
        # Must be the name listed in the `global` statement above; assigning
        # any other spelling only creates an unused local.
        # NOTE(review): confirm MAX_EVALUATION_ITERATION is the name the
        # rest of the module reads.
        MAX_EVALUATION_ITERATION = int(arg)
      elif option in ("-g", "--graph_iteration"):
        MAX_GRAPH_ITERATION = int(arg)
      elif option in ("-q", "--frequency_threshold"):
        FREQUENCY_THRESHOLD = int(arg)
      elif option in ("-i", "--initial_temperature"):
        INITIAL_TEMPERATURE = float(arg)
      elif option in ("-z", "--final_temperature"):
        FINAL_TEMPERATURE = float(arg)
      elif option in ("-p", "--alpha"):
        ALPHA = float(arg)
      elif option in ("-m", "--multiplier"):
        MULTIPLIER = float(arg)
      elif option in ("-r", "--reverse_score"):
        REVERSE_SCORE = 1
  except ValueError:
    # int()/float() rejected the value; report it like other usage errors
    # instead of dumping a traceback.
    print('Invalid value %r for option %s' % (arg, option))
    print('Add --help for correct usage')
    sys.exit(2)
  # When only the input name was given, derive the output name from it.
  if filename != 'data' and outputfile == 'data.dot':
    outputfile = filename + '.dot'
  start_time = time.clock()
  Decompress_Input(filename)
  if DEBUG_LEVEL > 1:
    print("species name list : %r\n" % species_name_list)
    print("Number of species : %d\n" % len(species_name_list))
    print("gene name list : %r\n" % gene_name_list)
    print("Number of genes : %d\n" % len(gene_name_list))
  if not ANNEALING_METHOD:
    (best_scene, gene_names) = Consensus_Method(start_time)
    best_score = best_scene.Evaluation()
  else:
    gene_names = gene_name_list
    best_scene = Simulated_Annealing()
    best_score = best_scene.Evaluation()
    if not SINGLE_TREE_CASE or MISSING_ORTHOLOG:
      for iteration in range(MAX_RESTART_ITERATION):
        if DEBUG_LEVEL > 0:
          print("### Round %d ###" % iteration)
        scene = Simulated_Annealing()
        score = scene.Evaluation()
        if score < best_score:
          best_scene = scene
          best_score = score
  if DEBUG_LEVEL > 0:
    print("Best score achieved : %d" % best_score)
    print("Trees selected :")
    for t in best_scene.selected_trees:
      print(t)
    print("Start reconstructing graph")
    for s in best_scene.species_list:
      print("%s %r" % (s.name, s.orthologs))
  best_graph = Reconstruct_Graph_Of_Life(best_scene, gene_names)
  # A graph's Evaluation() yields a pair (it is unpacked as one below), so
  # only its first element is the score. Keeping the whole pair would break
  # the %d format and make every `score < best_score` test succeed under
  # Python 2's mixed-type ordering.
  (best_score, number_of_species) = best_graph.Evaluation()
  if DEBUG_LEVEL > 1:
    print("Current graph score = %d" % best_score)
  for iteration in range(MAX_GRAPH_ITERATION):
    graph_of_life = Reconstruct_Graph_Of_Life(best_scene, gene_names)
    (score, number_of_species) = graph_of_life.Evaluation()
    if score < best_score:
      best_graph = graph_of_life
      best_score = score
      if DEBUG_LEVEL > 1:
        print("Better graph found! Score = %d" % best_score)
  Graph_Renumbering(best_graph)
  if DEBUG_LEVEL > 1:
    for v in best_graph.V:
      print("%s %r" % (v.name, v.orthologs))
  if DEBUG_LEVEL > 1:
    print(best_graph)
  end_time = time.clock()
  (events, species) = best_graph.Evaluation()
  # The +0.01 keeps a float sitting just below a whole number from being
  # truncated to the next lower integer by %d.
  print('Number of interbreeding events in this graph: %d' % (events + 0.01))
  print('Number of interbreeded species in this graph: %d' % (species + 0.01))
  print('Time elapsed: %f seconds' % (end_time - start_time))
  fp = open(outputfile, 'w')
  try:
    fp.write(best_graph.__repr__())
  finally:
    # Close explicitly so the output is flushed even if the write fails.
    fp.close()
  return

# Run the command-line driver only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
  main()
