# Try to handle redundant ligations. # The strategy will be to look just at the sevens and fives at the interfaces # Also the restricted sequences at the interfaces. # So identify the redundant ligations and take the already tried sequences # and check for issues. # Can check the entire ligated complex for restricted sequences. # Can check just the six before each interface + the one after the interface # for 7s up to 1 and 6. # Can check just the four before each interface + the one after # for fives up to 1 and 4. # The same goes for any ligated complements. # Any new strands in the ligation should be extended as usual. # Another possibility: simply take strands that already exist and feed # them in as if new. Just can't backtrack for those. # To do this however, we must know when we don't need to check for complements, # i.e. outside of the border region. # Or simply glue together anything that can be glued and then check for # all border regions. Maybe enhance the FIVES and SEVENS structures # to give labels to the strand-position pairs for which we have to exclude # complements. So if strand x position 13 excludes say a FIVE AACTG # then if we are back at x, 13 we don't have to worry about that # complement. This doesn't quite work however, because 13 might # be near the end of strand x. # Definitely need to keep track of which parts of each strand are being # taken care of in FIVES and SEVENS, x,13 to y,3. # So we go through laying out the strands and when we check, we check # whether we've already taken care of this. We need to do this because # we might have say [x,y] several times. # Using this approach we again advance one by one and check every FIVES # and SEVENS but now it's a pair consisting of a boolean and who was # responsible. import random # UTILS # x = set([1,2,3,3,1]) # x.intersection(y) # x.union(y) # x.issuperset(y) or x >= y # difference x - y full_restricted_seq = ['AAAA', 'TTTT', 'CCC', 'GGG', 'GGGG', 'CCCC']# + palindromic septameric units + palindromic pentameric units + alternating pyurine + pyrimidine def formRestrictedSequences(): pyrimadines = 'TC' purines = 'AG' lets = 'ACGT' #alternating purine - pyrimidines for pur1 in purines: for pyr1 in pyrimadines: for pur2 in purines: for pyr2 in pyrimadines: for pur3 in purines: for pyr3 in pyrimadines: full_restricted_seq.append(pur1 + pyr1 + pur2 + pyr2 + pur3 + pyr3) full_restricted_seq.append(pyr1 + pur1 + pyr2 + pur2 + pyr3 + pur3) def reverse_complement(s): """ get reverse complement of given sequence :param s: :return: """ rev = s[::-1] return complement(rev) def complement(s): """ get complement of given sequence :param s: :return: """ out = '' for let in s: out+= findcomp(let) return out def random_base(): """ get one random base :return: char """ baselist = ['A', 'T', 'C', 'G'] return random.choice(baselist) def random_sequence(length): """ get random sequence with given length :param length: :return: str """ seq = "" for i in range(1, length): seq += random_base() return seq def palindrome_check(seq): """ check if given sequence is palindrome :param seq: :return: bool """ # print("palindrome: " + seq + " = "+ str((seq == reverse_complement(seq)))) if "o" in seq: return False return seq == reverse_complement(seq) # check for palindromes from the last k on forward # returns True is there is a palindrome and False if not def multipalindrome_check(seq, k): i = k while(i < len(seq)): myseq = seq[-i:] if palindrome_check(myseq): return True i+= 1 return False def genfives(): """ generates all possible non restricted five nucleotide units as key and values = FALSE into global FIVES variable :return: None """ global FIVES lets = 'ACGT' for i1 in lets: for i2 in lets: for i3 in lets: for i4 in lets: for i5 in lets: new_five = i1 + i2 + i3 + i4 + i5 restricted = False # if palindrome_check(new_five): # print(new_five) # for sequence in {'AAAA', 'TTTT', 'CCC', 'GGG'}: # if sequence in new_five: # restricted = True # print(new_five + " is restricted.") # break # only add if not restricted if not restricted: FIVES[new_five] = [False, []] # print("Length of FIVES: " + str(len(FIVES)) ) def gensevens(): """ generates all possible non restricted seven nucleotide units as key and value = FALSE into global SEVENS variable :return: None """ global SEVENS global RESTRICTED_SEQ lets = 'ACGT' for i1 in lets: for i2 in lets: for i3 in lets: for i4 in lets: for i5 in lets: for i6 in lets: for i7 in lets: new_seven = i1 + i2 + i3 + i4 + i5 + i6 + i7 restricted = False if not restricted: SEVENS[new_seven] = [False, []] # APPLICATION-SPECIFIC # mylen is the total length of a strand which could be a ligation # doublezones is a sequence of indices during which the reverse complement # is also supposed to be made. # For example there could be x[y/~y]z[w/~w]. # In that case, if x, y, z, w are supposed to all be of length 20 say, # then doublezones would be [[20,39],[60,79]]. # For simplicity of bookeeping, this will always be created as a # doublestrand so there will be two arrays with array 0 being the primary # and 1 being the reverse complement. When the reverse complement has an X, # that means there is no nucleotide so that no constraint is violated. # For example, at the end, if the two arrays have something like # array 0: ACTAACCTTGG # array 1: XXXXTGGAAXX # Then we have ACTA, ACCTT, GG from the top and AAGGT # The AAGGT is the 5'-3' version of TGGAA # Besides the two arrays, we want to keep track of which # other sequences we've eliminated (e.g. reverse complements of 5-letter # sequences; reverse complements with one mismatch of 7-letter sequences). # We keep track of this in two dictionaries. # If we have to undo a choice of a base, then we must undo the effects on all # those dictionaries. # We must also be sure to advance the base to the next one for that position. # If not available, then we must undo again. # If we undo again. # We use outstands to check for strands we've already created. # totlen is the total length of the location, doublezones are where # there must be a complement, lig has the names of the strands def addnewstrand(totlen, doublezones, mylen, lig): out = ["", ""] # primary is location 0 orderindex = [] # index of letter at that position # (if get beyond 3, then must retreat by 1). order = (["A","T","G","C"]) random.shuffle(order) # This will be the order we will use throughout i = 0 # For each position, we need to keep track of the sequence name # and location within the sequence and the last location either within # that sequence or in the new one. # If we are on a ligation with strands we've already seen # and we get in trouble # while in a strand we've already seen, then give up and start over. while(i < totlen): mytrip = strandloc(0,i, mylen, lig): currentstrand = mytrip[0] # because first argument is 0, mytrip[1] == mytrip[2] currentlocwithinstrand = mytrip[2] seenpair = seen(i,mylen,lig) # if we've seen this location of this ligation and what the base was if i < len(orderindex) and (seenpair[0] == True): # we already have reached this location, it didn't work and we're # in a strand we've already specified. In that case, reject the whole lig. print "Cannot add lig ", lig , " because of strand " print lig[int(i / mylen)], " location: ", i % mylen return([[],[]]) if (i < len(orderindex)) and (seenpair[0] == False): # we already have reached this location and this is a new strand assert (i+1) == len(orderindex) foundFlag = False while not foundFlag: # when foundFlag is True then we are ready to try a letter at the location i. if orderindex[i] < 3: removeconstraints(out, i) # remove constraints on FIVE and SEVEN out[0] = out[0][:-1] out[1] = out[1][:-1] orderindex[i]+= 1 # can try next letter foundFlag = True elif 0 < len(orderindex): # the location i has tried all letters, so back up removeconstraints(out, i) # remove constraints on FIVE and SEVEN out[0] = out[0][:-1] out[1] = out[1][:-1] orderindex = orderindex[:-1] # in the next time through the loop we will perhaps try a new letter i = i-1 seenpair = seen(i,mylen,lig) elif 0 == len(orderindex): print "Cannot add this lig." return([[],[]]) else: print "We should never get here" return([[],[]]) elif i == len(orderindex): # we have never reached this location if seenpair[0]: # we have already seen this string # assign the index to the letter it should be j = order.index(seenpair[1]) orderindex.append(j) else: orderindex.append(0) # At this point, we are at a certain index i and orderindex[i] # is a particular base. # ??? At this point if we know this strand then we should use the base # from it. mybase = order[orderindex[i]] out[0]+=mybase if doublestrandedpos(i, doublezones): out[1]+=findcomp(mybase) else: out[1]+='X' # if not testok(out, mylen, lig) and alreadyseen(out, mylen, lig): # then try again ?? if testok(out, mylen, lig): # take into account whether we've seen this or not # update FIVEs and update SEVENs if len(out[0]) >= 5: myseq = reverse_complement(out[0][-5:]) # print("sequence: " + out[0][-5:] + " has reverse comp: " + myseq) mytrip = strandloc(5,len(out)-1, mylen, lig) mytripcomp = [formcomp(mytrip[0]), mytrip[1],mytrip[2]] # returns something like ['x',13] FIVES[myseq] = [True,mytrip] # reverse complement of last 5 myseq = reverse_complement(out[1][-5:][::-1]) if 'X' not in myseq: FIVES[myseq] = [True,mytripcomp] # ??change this to the beginning and end if len(out[0]) >= 7: mytrip = strandloc(7,len(out)-1, mylen, lig) mytripcomp = [formcomp(mytrip[0]), mytrip[1],mytrip[2]] myseq = reverse_complement(out[0][-7:]) updatesevens(myseq, mytrip ) myseq = reverse_complement(out[1][-7:][::-1]) if 'X' not in myseq: updatesevens(myseq, mytripcomp) i=i+1 # can advance to next base # print("Here is the main strand out[0]") # print(out[0]) # print("Here is the complement strand out[1]") # print(out[1]) return out # find the locations of the beginning and end of an L length # sequence where L is for example 5 or 7 and ends at location i def strandloc(L,indextohere, mylen, lig): startstrandname = lig[int((indextohere-L) / mylen)] endstrandname = lig[int(indextohere / mylen)] locwithinendstrand = indextohere % mylen return ([startstrandname, endstrandname, locwithinendstrand]) # has this location in this strand been seen before? # If so, return the strand name and the base of that position def seen(i,mylen,lig): strandname = lig[int(i / mylen)] locwithinstrand = i % mylen # modulo for o in outstrands: if o[0] == strandname: return [True, o[1][locwithinstrand]] return [False, []] # approximate sevens # given a seven, find all the versions of sevens with either the middle base # or one of the bases on the sides changes def approximateseven(seq): out = [] for let in 'ATGC': out.append(seq[:3] + let + seq[-3:]) out.append(seq[:2] + let + seq[-4:]) out.append(seq[:4] + let + seq[-2:]) return(out) # remove the constraints having to do with the last letter def removeconstraints(out, i): global FIVES if i >= 4: myseq = out[0][-5:] rcmyseq = reverse_complement(myseq) FIVES[rcmyseq] = [False,[]] # reset it myseq = out[1][-5:][::-1] if 'X' not in myseq: rcmyseq = reverse_complement(myseq) FIVES[rcmyseq] = [False,[]] # reset it if i >= 6: myseq = out[0][-7:] rcmyseq = reverse_complement(myseq) resetsevens(rcmyseq) myseq = out[1][-7:][::-1] if 'X' not in myseq: rcmyseq = reverse_complement(myseq) resetsevens(rcmyseq) # given a seven, update it and all its neighbors (different by one # either in the center or one on either side) def updatesevens(seq, mytrip): global SEVENS toupdate = approximateseven(seq) for u in toupdate: SEVENS[u] = [True, mytrip] # given a seven, reset it and all its neighbors (different by one # either in the center or one on either side) def resetsevens(seq): global SEVENS toreset = approximateseven(seq) for r in toreset: SEVENS[r] = [False, []] # if we've already seen these particular instances of 7s, then did they # come from the same sequences at the same positions def sevenscheck(seq, mytrip7): global SEVENS tocheck = approximateseven(seq) for c in tocheck: if SEVENS[c][0] if not SEVENS[c][1] == mytrip7: # some other triple is involved return True return False # find the complement of a base def findcomp(mybase): if mybase == 'A': return('T') if mybase == 'T': return('A') if mybase == 'C': return('G') if mybase == 'G': return('C') return(mybase) # determine whether i is in a doublezone def doublestrandedpos(i, doublezones): for d in doublezones: if (i >= d[0]) and (i <= d[1]): return True return False # determine whether the last base of a pre-existing sequence causes any problems # Pre-existing sequences are given to us. def testinitial_ok(out): L = len(out) # first take care of restricted sequences for r in full_restricted_seq: if L >= len(r): if r == out[-len(r):]: # print( "Forward is restricted") return False # Now look for palindromes if length of the strand is at least 4 if (len(out) > 4) and multipalindrome_check(out, 5): print( "Palindrome check for palindromes 5 or greater.") return False return True # determine whether the last base added to out (and its complement) # causes any issues. # see whether last base added (and possibly its complement) are ok def testok(out, mylen, lig): global FIVES myindex = len(out) - 1 mytriple5 = strandloc(5,myindex, mylen, lig) # triple for 5 for current ligation mytriple7 = strandloc(7,myindex, mylen, lig) # triple for 7 L = len(out[0]) myrevcomp = out[1][::-1] # first take care of restricted sequences for r in full_restricted_seq: if L >= len(r): if r == out[0][-len(r):]: # print( "Forward is restricted") return False if r == myrevcomp[:len(r)]: # print( "Rev complemented is restricted") return False # Now look for palindromes if length of the strand is at least 4 if (len(out[0]) > 4) and multipalindrome_check(out[0], 5): print( "Palindrome check for palindromes 5 or greater.") return False # now take care of FIVES and SEVENS if (len(out[0])) >= 5: myseq = reverse_complement(out[0][-5:]) if FIVES[myseq][0]: # this has already been used previoustrip5 = FIVES[myseq][1] if not previoustrip5 == mytrip5: # was it used by the same strand(s) at the same positions? # print( "FIVEs check forward") return False myseq = reverse_complement(myrevcomp[:5]) if (('X') not in myseq) and FIVES[myseq][0]: previoustrip5 = FIVES[myseq][1] if not previoustrip5 == mytrip5: # if we haven't seen this yet # print( "FIVEs check backwards") return False if (len(out[0])) >= 7: myseq = reverse_complement(out[0][-7:]) if sevenscheck(myseq, mytrip7): # there is a match of myseq or some neighbor # print( "SEVENs check forward") return False myseq = reverse_complement(myrevcomp[:7]) if (('X') not in myseq) and sevenscheck(myseq, mytrip7): # print( "SEVENs check backwards") return False return True # form all sequences assuming each atomic sequence is of length mylen # This handles the disjoint ligations only. def formseqs(mylen): global outstrands # first take care of disjointligations for i in range(len(disjointligations)): lig = disjointligations[i] print('lig is: ') print(', '.join(lig)) totlen = mylen * len(lig) doublezones = [] doublezoneloc = 0 for x in lig: if hascomp(x): doublezones.append([doublezoneloc, doublezoneloc + mylen - 1]) doublezoneloc += mylen ret = addnewstrand(totlen, doublezones, mylen, lig) if 0 < len(ret[0]): doublezoneloc = 0 for x in lig: outstrands.append([x, ret[0][doublezoneloc:doublezoneloc+mylen]]) if hascomp(x): outstrands.append([formcomp(x), (ret[1][doublezoneloc:doublezoneloc+mylen])[::-1]]) doublezoneloc+=mylen else: # nothing returned from addnewstrand print("New ligation: ") print(', '.join(lig)) print ("failed to form.") # ??? UNUSED # Now take care of overlapligations and unligated strands # Somehow, we have to gather the previously used strands and add them in. # As we do that, we do not have to recheck the FIVES and SEVENS till we get # to the border regions. # Complements will already be taken care of. def formseqs_overlap(mylen): global outstrands # first take care of overlapligations for i in range(len(overlapligations)): lig = overlapligations[i] print('lig is: ') print(', '.join(lig)) totlen = mylen * len(lig) doublezones = [] doublezoneloc = 0 for x in lig: if hascomp(x): doublezones.append([doublezoneloc, doublezoneloc + mylen - 1]) doublezoneloc += mylen ret = addnewstrand(totlen, doublezones, mylen, lig) if 0 < len(ret[0]): doublezoneloc = 0 for x in lig: outstrands.append([x, ret[0][doublezoneloc:doublezoneloc+mylen]]) if hascomp(x): outstrands.append([formcomp(x), (ret[1][doublezoneloc:doublezoneloc+mylen])[::-1]]) doublezoneloc+=mylen else: # nothing returned from addnewstrand print("New ligation: ") print(', '.join(lig)) print ("failed to form.") print("Now take care of unligated strands") for s in allstrands: donestrands = [row[0] for row in outstrands] if s not in donestrands: print('strand is: ' + s) doublezones = [] if hascomp(s): doublezones.append([0, mylen - 1]) ret = addnewstrand(mylen, doublezones, mylen, [s]) # only a single strand if 0 < len(ret[0]): outstrands.append([s, ret[0][0:mylen]]) if hascomp(s): outstrands.append([formcomp(s), (ret[1][0:mylen])[::-1]]) else: print("Strand: " + s + " could not be formed") return(outstrands) # is the reverse complement of x in allstrands def hascomp(x): posscomp = x + 'comp' if posscomp in allstrands: return(True) if 4 < len(x): if (x[-4:] == 'comp') and (x[:-4] in allstrands): return(True) return(False) # form the complement name def formcomp(x): posscomp = x + 'comp' if posscomp in allstrands: return(posscomp) if 4 < len(x): if (x[-4:] == 'comp') and (x[:-4] in allstrands): return(x[:-4]) return(False) # hasintersect takes one array lig and sees whether any element in that array # is the same as any element of any array in other def hasintersect(lig, other): lig_set = set(lig) for ot in other: if 0 < len(lig_set.intersection(set(ot))): return True return False # DATA FIVES = {} genfives() SEVENS = {} gensevens() outstrands = [] # strands already taken care of formRestrictedSequences() # adds in some new restrictions preexistingseqs = [] # preexistingseqs.append('TGCAGCCATGCCGCGTGTATGAAGAAG') if 0 < len(preexistingseqs): for p in preexistingseqs: print "taking care of preexisting sequence: ", p # first check for restricted sequences and palindromes growingstring = '' for i in range(0,len(p)): growingstring += p[i] if not testinitial_ok(growingstring): print "ERROR IN PRE-EXISTING STRING PREFIX: ", growingstring # assert False # first the fives for i in range(0,len(p)-4): s = p[i:i+5] # print "s is: ", s FIVES[s] = [True, []] # ?? Not sure whether we need to know beginning and end for i in range(0,len(p)-6): s = p[i:i+7] # print "s is: ", s SEVENS[s] = [True,[]] # ?? Not sure whether we need to know beginning and end # print(sorted(full_restricted_seq)) print("User must set the variables allstrands and ligations near the very bottom of the dnacompiler.py file.") allstrands = [ 'x', 'xcomp', 'y', 'z', 'zcomp', 'w', 'wcomp', 'v',] ligations = [['x','y','z'], ['wcomp', 'v'], ['x', 'v', 'q'], ['a', 'b']] for lig in ligations: for x in lig: if x not in allstrands: allstrands.append(x) print "allstrands is: ", allstrands # ligations no longer need to be disjoint, but we handle the disjoint # ones first and then check about the others. disjointligations = [] overlapligations = [] for lig in ligations: if hasintersect(lig, disjointligations): overlapligations.append(lig) else: disjointligations.append(lig) print "disjointligations: ", disjointligations print "overlapligations: ", overlapligations # have to infer the overlap # loop over lengths and try to extend. mylen = 14 outstrands = [] x = formseqs(mylen) # This routine does all the work for all ligations print("outstrands :") for s in outstrands: print(s) donestrands = [row[0] for row in outstrands] for lig in ligations: print('_'.join(lig)) out = '' for x in lig: i = donestrands.index(x) out+= outstrands[i][1] print(out) # gather statistics fivecount = 0 fiveout = [] for x in FIVES: if FIVES[x][0]: # ?? Not sure whether we need to know beginning and end fivecount+=1 fiveout.append(x) # print("fivecount: " + str(fivecount)) # print(sorted(fiveout)) sevencount = 0 for x in SEVENS: if SEVENS[x][0]: # ?? Not sure whether we need to know beginning and end sevencount+=1 # print("sevencount: " + str(sevencount))