# Try to handle redundant ligations.
# The strategy will be to look just at the sevens and fives at the interfaces
# Also the restricted sequences at the interfaces.
# So identify the redundant ligations and take the already tried sequences
# and check for issues.
# Can check the entire ligated complex for restricted sequences.
# Can check just the six before each interface + the one after the interface
# for 7s up to 1 and 6. 
# Can check just the four before each interface + the one after
# for fives up to 1 and 4. 
# The same goes for any ligated complements.
# Any new strands in the ligation should be extended as usual.

# Another possibility: simply take strands that already exist and feed
# them in as if new. Just can't backtrack for those.
# To do this however, we must know when we don't need to check for complements,
# i.e. outside of the border region.

# Or simply glue together anything that can be glued and then check for
# all border regions. Maybe enhance the FIVES and SEVENS structures
# to give labels to the strand-position pairs for which we have to exclude
# complements. So if strand x position 13 excludes say a FIVE AACTG
# then if we are back at x, 13 we don't have to worry about that 
# complement. This doesn't quite work however, because 13 might
# be near the end of strand x.

# Definitely need to keep track of which parts of each strand are being
# taken care of in FIVES and SEVENS, x,13 to y,3.
# So we go through laying out the strands and when we check, we check
# whether we've already taken care of this. We need to do this because
# we might have say [x,y] several times.
# Using this approach we again advance one by one and check every FIVES
# and SEVENS but now it's a pair consisting of a boolean and who was
# responsible.


import random

# UTILS

# x = set([1,2,3,3,1])
# x.intersection(y)
# x.union(y)
# x.issuperset(y) or x >= y
# difference x - y


full_restricted_seq = ['AAAA', 'TTTT', 'CCC', 'GGG', 'GGGG', 'CCCC']# + palindromic septameric units + palindromic pentameric units + alternating pyurine + pyrimidine


def formRestrictedSequences():
    pyrimadines = 'TC'
    purines = 'AG'

    lets = 'ACGT'
    #alternating purine - pyrimidines
    for pur1 in purines:
        for pyr1 in pyrimadines:
            for pur2 in purines:
                for pyr2 in pyrimadines:
                    for pur3 in purines:
                        for pyr3 in pyrimadines:
                            full_restricted_seq.append(pur1 + pyr1 + pur2 + pyr2 + pur3 + pyr3)
                            full_restricted_seq.append(pyr1 + pur1 + pyr2 + pur2 + pyr3 + pur3)


def reverse_complement(s):
    """
    get reverse complement of given sequence
    :param s:
    :return:
    """

    rev = s[::-1]
    return complement(rev)


def complement(s):
    """
    get complement of given sequence
    :param s:
    :return:
    """

    out = ''
    for let in s:
        out+= findcomp(let)
    return out


def random_base():
    """
    get one random base
    :return: char
    """

    baselist = ['A', 'T', 'C', 'G']
    return random.choice(baselist)


def random_sequence(length):
    """
    get random sequence with given length
    :param length:
    :return: str
    """

    seq = ""
    for i in range(1, length):
        seq += random_base()
    return seq


def palindrome_check(seq):
    """
    check if given sequence is palindrome
    :param seq:
    :return: bool
    """

    # print("palindrome: " + seq + "  = "+ str((seq == reverse_complement(seq))))
    if "o" in seq:
        return False
    return seq == reverse_complement(seq)

# check for palindromes from the last k on forward
# returns True is there is a palindrome and False if not
def multipalindrome_check(seq, k):
  i = k
  while(i < len(seq)):
    myseq = seq[-i:]
    if palindrome_check(myseq):
      return True
    i+= 1
  return False


def genfives():
    """
    generates all possible non restricted five nucleotide units as key and values = FALSE into
    global FIVES variable

    :return: None
    """

    global FIVES
    lets = 'ACGT'
    for i1 in lets:
        for i2 in lets:
            for i3 in lets:
                for i4 in lets:
                    for i5 in lets:
                        new_five = i1 + i2 + i3 + i4 + i5
                        restricted = False

                        # if palindrome_check(new_five):
                            # print(new_five)

                        # for sequence in {'AAAA', 'TTTT', 'CCC', 'GGG'}:
                            # if sequence in new_five:
                                # restricted = True
                                # print(new_five + " is restricted.")
                                # break

                        # only add if not restricted
                        if not restricted:
                            FIVES[new_five] = [False, []]

    # print("Length of FIVES: " + str(len(FIVES)) )


def gensevens():
    """
    generates all possible non restricted seven nucleotide units as key and value = FALSE into global SEVENS variable

    :return: None
    """

    global SEVENS
    global RESTRICTED_SEQ
    lets = 'ACGT'
    for i1 in lets:
        for i2 in lets:
            for i3 in lets:
                for i4 in lets:
                    for i5 in lets:
                        for i6 in lets:
                            for i7 in lets:
                                new_seven = i1 + i2 + i3 + i4 + i5 + i6 + i7
                                restricted = False
                                if not restricted:
                                    SEVENS[new_seven] =  [False, []]


# APPLICATION-SPECIFIC

# mylen is the total length of a strand which could be a ligation
# doublezones is a sequence of indices during which the reverse complement
# is also supposed to be made.
# For example there could be x[y/~y]z[w/~w].
# In that case, if x, y, z, w are supposed to all be of length 20 say,
# then doublezones would be [[20,39],[60,79]].  
# For simplicity of bookeeping, this will always be created as a 
# doublestrand so there will be two arrays with array 0 being the primary
# and 1 being the reverse complement. When the reverse complement has an X,
# that means there is no nucleotide so that no constraint is violated.
# For example, at the end, if the two arrays have something like
# array 0: ACTAACCTTGG
# array 1: XXXXTGGAAXX
# Then we have ACTA, ACCTT, GG from the top and AAGGT
# The AAGGT is the 5'-3' version of TGGAA

# Besides the two arrays, we want to keep track of which 
# other sequences we've eliminated (e.g. reverse complements of 5-letter
# sequences; reverse complements with one mismatch of 7-letter sequences).
# We keep track of this in two dictionaries.
# If we have to undo a choice of a base, then we must undo the effects on all
# those dictionaries.
# We must also be sure to advance the base to the next one for that position.
# If not available, then we must undo again.
# If we undo again.
# We use outstands to check for strands we've already created.
# totlen is the total length of the location, doublezones are where
# there must be a complement, lig has the names of the strands
def addnewstrand(totlen, doublezones, mylen, lig):
  out = ["", ""] # primary is location 0
  orderindex = [] # index of letter at that position 
	# (if get beyond 3, then must retreat by 1).
  order = (["A","T","G","C"])
  random.shuffle(order) # This will be the order we will use throughout
  i = 0
  # For each position, we need to keep track of the sequence name
  # and location within the sequence and the last location either within
  # that sequence or in the new one. 
  # If we are on a ligation with strands we've already seen 
  # and we get in trouble
  # while  in a strand we've already seen, then give up and start over.
  while(i < totlen):
    mytrip = strandloc(0,i, mylen, lig): 
    currentstrand = mytrip[0] # because first argument is 0, mytrip[1] == mytrip[2]
    currentlocwithinstrand = mytrip[2] 
    seenpair = seen(i,mylen,lig) 
      # if we've seen this location of this ligation and what the base was
    if i < len(orderindex) and (seenpair[0] == True): 
      # we already have reached this location, it didn't work and we're
      # in a strand we've already specified. In that case, reject the whole lig.
      print "Cannot add lig ", lig , " because of strand "  
      print lig[int(i / mylen)], " location: ", i % mylen
      return([[],[]])
    if (i < len(orderindex)) and (seenpair[0] == False): 
      # we already have reached this location and this is a new strand
      assert (i+1) == len(orderindex)
      foundFlag = False
      while not foundFlag: # when foundFlag is True then we are ready to try a letter at the location i.
        if orderindex[i] < 3:
          removeconstraints(out, i) # remove constraints on FIVE and SEVEN
          out[0] = out[0][:-1]
          out[1] = out[1][:-1]
          orderindex[i]+= 1 # can try next letter
          foundFlag = True
        elif 0 < len(orderindex): 
          # the location i has tried all letters, so back up
          removeconstraints(out, i) # remove constraints on FIVE and SEVEN
          out[0] = out[0][:-1]
          out[1] = out[1][:-1]
          orderindex = orderindex[:-1]
          # in the next time through the loop we will perhaps try a new letter
          i = i-1
          seenpair = seen(i,mylen,lig)
        elif 0 == len(orderindex):
          print "Cannot add this lig."
          return([[],[]])
        else:
          print "We should never get here"
          return([[],[]])
    elif i == len(orderindex): # we have never reached this location
      if seenpair[0]: # we have already seen this string
        # assign the index to the letter it should be
        j = order.index(seenpair[1])
        orderindex.append(j)
      else:
        orderindex.append(0)
    # At this point, we are at a certain index i and orderindex[i]
    # is a particular base.
    # ??? At this point if we know this strand then we should use the base
    # from it.
    mybase = order[orderindex[i]]
    out[0]+=mybase
    if doublestrandedpos(i, doublezones):
      out[1]+=findcomp(mybase)
    else:
      out[1]+='X'
    # if not testok(out, mylen, lig) and alreadyseen(out, mylen, lig):
       # then try again ??
    if testok(out, mylen, lig): # take into account whether we've seen this or not
      # update FIVEs and update SEVENs
      if len(out[0]) >= 5:
        myseq = reverse_complement(out[0][-5:]) 
	# print("sequence: " + out[0][-5:] + " has reverse comp: " + myseq)
	mytrip = strandloc(5,len(out)-1, mylen, lig) 
        mytripcomp = [formcomp(mytrip[0]), mytrip[1],mytrip[2]]
           # returns something like ['x',13]
        FIVES[myseq] = [True,mytrip] # reverse complement of last 5 
        myseq = reverse_complement(out[1][-5:][::-1]) 
        if 'X' not in myseq:
          FIVES[myseq] = [True,mytripcomp] # ??change this to the beginning and end
      if len(out[0]) >= 7:
	mytrip = strandloc(7,len(out)-1, mylen, lig) 
        mytripcomp = [formcomp(mytrip[0]), mytrip[1],mytrip[2]]
        myseq = reverse_complement(out[0][-7:]) 
        updatesevens(myseq, mytrip )
        myseq = reverse_complement(out[1][-7:][::-1]) 
        if 'X' not in myseq:
          updatesevens(myseq, mytripcomp)
      i=i+1 # can advance to next base
  # print("Here is the main strand out[0]")
  # print(out[0])
  # print("Here is the complement strand out[1]")
  # print(out[1])
  return out

# find the locations of the beginning and end of an L length
# sequence where L is for example 5 or 7 and ends at location i
def strandloc(L,indextohere, mylen, lig): 
  startstrandname = lig[int((indextohere-L) / mylen)]
  endstrandname = lig[int(indextohere / mylen)]
  locwithinendstrand = indextohere % mylen
  return ([startstrandname, endstrandname, locwithinendstrand])

# has this location in this strand been seen before?
# If so, return the strand name and the base of that position
def seen(i,mylen,lig): 
  strandname = lig[int(i / mylen)]
  locwithinstrand = i % mylen # modulo
  for o in outstrands:
    if o[0] == strandname:
      return [True, o[1][locwithinstrand]]
  return [False, []]

# approximate sevens
# given a seven, find all the versions of sevens with either the middle base
# or one of the bases on the sides changes
def approximateseven(seq):
  out = []
  for let in 'ATGC':
    out.append(seq[:3] + let + seq[-3:])
    out.append(seq[:2] + let + seq[-4:])
    out.append(seq[:4] + let + seq[-2:])
  return(out)

# remove the constraints having to do with the last letter
def removeconstraints(out, i):
  global FIVES
  if i >= 4:
    myseq = out[0][-5:]
    rcmyseq = reverse_complement(myseq)
    FIVES[rcmyseq] = [False,[]] # reset it
    myseq = out[1][-5:][::-1]
    if 'X' not in myseq:
      rcmyseq = reverse_complement(myseq)
      FIVES[rcmyseq] = [False,[]] # reset it
  if i >= 6:
    myseq = out[0][-7:]
    rcmyseq = reverse_complement(myseq)
    resetsevens(rcmyseq)
    myseq = out[1][-7:][::-1]
    if 'X' not in myseq:
      rcmyseq = reverse_complement(myseq)
      resetsevens(rcmyseq)

# given a seven, update it and all its neighbors (different by one
# either in the center or one on either side)
def updatesevens(seq, mytrip):
  global SEVENS
  toupdate = approximateseven(seq)
  for u in toupdate:
    SEVENS[u] = [True, mytrip] 

# given a seven, reset it and all its neighbors (different by one
# either in the center or one on either side)
def resetsevens(seq):
  global SEVENS
  toreset = approximateseven(seq)
  for r in toreset:
    SEVENS[r] = [False, []]

# if we've already seen these particular instances of 7s, then did they
# come from the same sequences at the same positions
def sevenscheck(seq, mytrip7):
  global SEVENS
  tocheck = approximateseven(seq)
  for c in tocheck:
    if SEVENS[c][0] 
      if not SEVENS[c][1] == mytrip7: # some other triple is involved
        return True
  return False

# find the complement of a base
def findcomp(mybase):
  if mybase == 'A':
    return('T')
  if mybase == 'T':
    return('A')
  if mybase == 'C':
    return('G')
  if mybase == 'G':
    return('C')
  return(mybase)

# determine whether i is in a doublezone
def doublestrandedpos(i, doublezones):
  for d in doublezones:
    if (i >= d[0]) and (i <= d[1]):
      return True
  return False

# determine whether the last base of a pre-existing sequence causes any problems
# Pre-existing sequences are given to us.
def testinitial_ok(out):
  L = len(out)
  # first take care of restricted sequences
  for r in full_restricted_seq:
   if L >= len(r):
     if r == out[-len(r):]:
       # print( "Forward is restricted")
       return False
  # Now look for palindromes if length of the strand is at least 4
  if (len(out) > 4) and multipalindrome_check(out, 5):
    print( "Palindrome check for palindromes 5 or greater.")
    return False
  return True
   
# determine whether the last base added to out (and its complement)
# causes any issues.
# see whether last base added (and possibly its complement) are ok
def testok(out, mylen, lig):
  global FIVES
  myindex = len(out) - 1
  mytriple5 = strandloc(5,myindex, mylen, lig) # triple for 5 for current ligation
  mytriple7 = strandloc(7,myindex, mylen, lig) # triple for 7
  L = len(out[0])
  myrevcomp = out[1][::-1]
  # first take care of restricted sequences
  for r in full_restricted_seq:
   if L >= len(r):
     if r == out[0][-len(r):]:
       # print( "Forward is restricted")
       return False
     if r == myrevcomp[:len(r)]:
       # print( "Rev complemented is restricted")
       return False
  # Now look for palindromes if length of the strand is at least 4
  if (len(out[0]) > 4) and multipalindrome_check(out[0], 5):
    print( "Palindrome check for palindromes 5 or greater.")
    return False
  # now take care of FIVES and SEVENS
  if (len(out[0])) >= 5:
    myseq = reverse_complement(out[0][-5:]) 
    if FIVES[myseq][0]: # this has already been used
      previoustrip5 =  FIVES[myseq][1]
      if not previoustrip5 == mytrip5: # was it used by the same strand(s) at the same positions?
        # print( "FIVEs check forward")
        return False
    myseq = reverse_complement(myrevcomp[:5])
    if (('X') not in myseq) and FIVES[myseq][0]:  
      previoustrip5 =  FIVES[myseq][1]
      if not previoustrip5 == mytrip5: # if we haven't seen this yet
        # print( "FIVEs check backwards")
        return False
  if (len(out[0])) >= 7:
    myseq = reverse_complement(out[0][-7:]) 
    if sevenscheck(myseq, mytrip7): # there is a match of myseq or some neighbor
        # print( "SEVENs check forward")
        return False
    myseq = reverse_complement(myrevcomp[:7])
    if (('X') not in myseq) and sevenscheck(myseq, mytrip7):
        # print( "SEVENs check backwards")
        return False
  return True


# form all sequences assuming each atomic sequence is of length mylen
# This handles the disjoint ligations only.
def formseqs(mylen):
  global outstrands
  # first take care of disjointligations
  for i in range(len(disjointligations)):
    lig = disjointligations[i]
    print('lig is: ')
    print(', '.join(lig))
    totlen = mylen * len(lig)
    doublezones = []
    doublezoneloc = 0
    for x in lig:
      if hascomp(x):
        doublezones.append([doublezoneloc, doublezoneloc + mylen - 1])
      doublezoneloc += mylen
    ret = addnewstrand(totlen, doublezones, mylen, lig)
    if 0 < len(ret[0]):
       doublezoneloc = 0
       for x in lig: 
         outstrands.append([x, ret[0][doublezoneloc:doublezoneloc+mylen]])
         if hascomp(x):
           outstrands.append([formcomp(x), (ret[1][doublezoneloc:doublezoneloc+mylen])[::-1]])
         doublezoneloc+=mylen
    else: # nothing returned from addnewstrand
      print("New ligation: ")
      print(', '.join(lig))
      print ("failed to form.")

# ??? UNUSED
# Now take care of overlapligations and unligated strands
# Somehow, we have to gather the previously used strands and add them in.
# As we do that, we do not have to recheck the FIVES and SEVENS till we get
# to the border regions.
# Complements will already be taken care of.
def formseqs_overlap(mylen):
  global outstrands
  # first take care of overlapligations
  for i in range(len(overlapligations)):
    lig = overlapligations[i]
    print('lig is: ')
    print(', '.join(lig))
    totlen = mylen * len(lig)
    doublezones = []
    doublezoneloc = 0
    for x in lig:
      if hascomp(x):
        doublezones.append([doublezoneloc, doublezoneloc + mylen - 1])
      doublezoneloc += mylen
    ret = addnewstrand(totlen, doublezones, mylen, lig)
    if 0 < len(ret[0]):
       doublezoneloc = 0
       for x in lig: 
         outstrands.append([x, ret[0][doublezoneloc:doublezoneloc+mylen]])
         if hascomp(x):
           outstrands.append([formcomp(x), (ret[1][doublezoneloc:doublezoneloc+mylen])[::-1]])
         doublezoneloc+=mylen
    else: # nothing returned from addnewstrand
      print("New ligation: ")
      print(', '.join(lig))
      print ("failed to form.")
  print("Now take care of unligated strands")
  for s in allstrands:
    donestrands = [row[0] for row in outstrands]
    if s not in donestrands:
      print('strand is: ' + s)
      doublezones = []
      if hascomp(s):
        doublezones.append([0, mylen - 1])
      ret = addnewstrand(mylen, doublezones, mylen, [s]) # only a single strand
      if 0 < len(ret[0]):
        outstrands.append([s, ret[0][0:mylen]])
        if hascomp(s):
          outstrands.append([formcomp(s), (ret[1][0:mylen])[::-1]])
      else:
        print("Strand: " + s + " could not be formed")
  return(outstrands)
       

# is the reverse complement of x in allstrands
def hascomp(x):
  posscomp = x + 'comp'
  if posscomp in allstrands:
    return(True)
  if 4 < len(x):
    if (x[-4:] == 'comp') and (x[:-4] in allstrands):
      return(True)
  return(False)
      
# form the complement name
def formcomp(x):
  posscomp = x + 'comp'
  if posscomp in allstrands:
    return(posscomp)
  if 4 < len(x):
    if (x[-4:] == 'comp') and (x[:-4] in allstrands):
      return(x[:-4])
  return(False)

# hasintersect takes one array lig and sees whether any element in that array
# is the same as any element of any array in other
def hasintersect(lig, other):
  lig_set = set(lig)
  for ot in other:
    if 0 < len(lig_set.intersection(set(ot))):
      return True
  return False
      

# DATA
FIVES = {}
genfives()
SEVENS = {}
gensevens()
outstrands = [] # strands already taken care of

formRestrictedSequences() # adds in some new restrictions

preexistingseqs = []
# preexistingseqs.append('TGCAGCCATGCCGCGTGTATGAAGAAG')
if 0 < len(preexistingseqs):
  for p in preexistingseqs:
    print "taking care of preexisting sequence: ", p
    # first check for restricted sequences and palindromes
    growingstring = ''
    for i in range(0,len(p)):
      growingstring += p[i]
      if not testinitial_ok(growingstring):
        print "ERROR IN PRE-EXISTING STRING PREFIX: ", growingstring
        # assert False
    # first the fives
    for i in range(0,len(p)-4):
      s = p[i:i+5]
      # print "s is: ", s
      FIVES[s] = [True, []]  # ?? Not sure whether we need to know beginning and end
    for i in range(0,len(p)-6):
      s = p[i:i+7]
      # print "s is: ", s
      SEVENS[s] = [True,[]]  # ?? Not sure whether we need to know beginning and end


# print(sorted(full_restricted_seq))

print("User must set the variables allstrands and ligations near the very bottom of the dnacompiler.py file.")

allstrands = [ 'x', 'xcomp', 'y', 'z', 'zcomp', 'w', 'wcomp', 'v',]
ligations = [['x','y','z'], ['wcomp', 'v'], ['x', 'v', 'q'], ['a', 'b']] 
for lig in ligations:
  for x in lig:
   if x not in allstrands:
     allstrands.append(x)
print "allstrands is: ", allstrands
# ligations no longer need to be disjoint, but we handle the disjoint
# ones first and then check about the others.

disjointligations = []
overlapligations = []
for lig in ligations:
  if hasintersect(lig, disjointligations):
    overlapligations.append(lig)
  else:
    disjointligations.append(lig)

print "disjointligations: ", disjointligations
print "overlapligations: ", overlapligations
    

# have to infer the overlap
# loop over lengths and try to extend.
mylen = 14
outstrands = []
x = formseqs(mylen) # This routine does all the work for all  ligations
print("outstrands :")
for s in outstrands:
  print(s)
donestrands = [row[0] for row in outstrands]
for lig in ligations:
  print('_'.join(lig))
  out = ''
  for x in lig:
    i = donestrands.index(x)
    out+= outstrands[i][1]
  print(out)

# gather statistics

fivecount = 0
fiveout = []
for x in FIVES:
  if FIVES[x][0]:  # ?? Not sure whether we need to know beginning and end
    fivecount+=1
    fiveout.append(x)
# print("fivecount: " + str(fivecount))
# print(sorted(fiveout))

sevencount = 0
for x in SEVENS:
  if SEVENS[x][0]:  # ?? Not sure whether we need to know beginning and end
    sevencount+=1
# print("sevencount: " + str(sevencount))