# Try to handle redundant ligations. # The strategy will be to look just at the sevens at the interfaces # and at the five at the interfaces. There should be no conflicts # with the strands themselves, but possibly with others. # UTILS import random full_restricted_seq = ['AAAA', 'TTTT', 'CCC', 'GGG', 'GGGG', 'CCCC']# + palindromic septameric units + palindromic pentameric units + alternating pyurine + pyrimidine def formRestrictedSequences(): pyrimadines = 'TC' purines = 'AG' lets = 'ACGT' #alternating purine - pyrimidines for pur1 in purines: for pyr1 in pyrimadines: for pur2 in purines: for pyr2 in pyrimadines: for pur3 in purines: for pyr3 in pyrimadines: full_restricted_seq.append(pur1 + pyr1 + pur2 + pyr2 + pur3 + pyr3) full_restricted_seq.append(pyr1 + pur1 + pyr2 + pur2 + pyr3 + pur3) def reverse_complement(s): """ get reverse complement of given sequence :param s: :return: """ rev = s[::-1] return complement(rev) def complement(s): """ get complement of given sequence :param s: :return: """ out = '' for let in s: out+= findcomp(let) return out def random_base(): """ get one random base :return: char """ baselist = ['A', 'T', 'C', 'G'] return random.choice(baselist) def random_sequence(length): """ get random sequence with given length :param length: :return: str """ seq = "" for i in range(1, length): seq += random_base() return seq def palindrome_check(seq): """ check if given sequence is palindrome :param seq: :return: bool """ # print("palindrome: " + seq + " = "+ str((seq == reverse_complement(seq)))) if "o" in seq: return False return seq == reverse_complement(seq) # check for palindromes from the last k on forward # returns True is there is a palindrome and False if not def multipalindrome_check(seq, k): i = k while(i < len(seq)): myseq = seq[-i:] if palindrome_check(myseq): return True i+= 1 return False # UNUSED def get_new_restriction_score(e_, seq, new_base): score = 0 new_seq = seq + new_base seq_len = len(new_seq) #restricted seq check for res in e_.full_restricted_seq: res_length = len(res) if(seq_len >= res_length): if new_seq[seq_len-res_length:] == res: score += 1 #palindrome check if(seq_len >= 4): six = new_seq[-4:] if e_.palindrome_check(six): score += 2 if six in seq: score += 3 return score def genfives(): """ generates all possible non restricted five nucleotide units as key and values = FALSE into global FIVES variable :return: None """ global FIVES lets = 'ACGT' for i1 in lets: for i2 in lets: for i3 in lets: for i4 in lets: for i5 in lets: new_five = i1 + i2 + i3 + i4 + i5 restricted = False # if palindrome_check(new_five): # print(new_five) # for sequence in {'AAAA', 'TTTT', 'CCC', 'GGG'}: # if sequence in new_five: # restricted = True # print(new_five + " is restricted.") # break # only add if not restricted if not restricted: FIVES[new_five] = False # print("Length of FIVES: " + str(len(FIVES)) ) def gensevens(): """ generates all possible non restricted seven nucleotide units as key and value = FALSE into global SEVENS variable :return: None """ global SEVENS global RESTRICTED_SEQ lets = 'ACGT' for i1 in lets: for i2 in lets: for i3 in lets: for i4 in lets: for i5 in lets: for i6 in lets: for i7 in lets: new_seven = i1 + i2 + i3 + i4 + i5 + i6 + i7 restricted = False if not restricted: SEVENS[new_seven] = False # APPLICATION-SPECIFIC # mylen is the total length of a strand which could be a ligation # doublezones is a sequence of indices during which the reverse complement # is also supposed to be made. # For example there could be x[y/~y]z[w/~w]. # In that case, if x, y, z, w are supposed to all be of length 20 say, # then doublezones would be [[20,39],[60,79]]. # For simplicity of bookeeping, this will always be created as a # doublestrand so there will be two arrays with array 0 being the primary # and 1 being the reverse complement. When the reverse complement has an X, # that means there is no nucleotide so that no constraint is violated. # For example, at the end, if the two arrays have something like # array 0: ACTAACCTTGG # array 1: XXXXTGGAAXX # Then we have ACTA, ACCTT, GG from the top and AAGGT # The AAGGT is the 5'-3' version of TGGAA # Besides the two arrays, we want to keep track of which # other sequences we've eliminated (e.g. reverse complements of 5-letter # sequences; reverse complements with one mismatch of 7-letter sequences). # We keep track of this in two dictionaries. # If we have to undo a choice of a base, then we must undo the effects on all # those dictionaries. # We must also be sure to advance the base to the next one for that position. # If not available, then we must undo again. # If we undo again. def addnewstrand(mylen, doublezones): out = ["", ""] # primary is location 0 orderindex = [] # index of letter at that position # (if get beyond 3, then must retreat by 1). order = (["A","T","G","C"]) random.shuffle(order) # This will be the order we will use throughout i = 0 while(i < mylen): if i < len(orderindex): # we already have reached this location assert (i+1) == len(orderindex) foundFlag = False while not foundFlag: if orderindex[i] < 3: removeconstraints(out, i) # remove constraints on FIVE and SEVEN out[0] = out[0][:-1] out[1] = out[1][:-1] orderindex[i]+= 1 # can try next letter foundFlag = True elif 0 < len(orderindex): removeconstraints(out, i) # remove constraints on FIVE and SEVEN out[0] = out[0][:-1] out[1] = out[1][:-1] orderindex = orderindex[:-1] i = i-1 elif 0 == len(orderindex): print "Cannot add this strand." return([[],[]]) else: print "We should never get here" return([[],[]]) elif i == len(orderindex): # we have never reached this location orderindex.append(0) # At this point, we are at a certain index i and orderindex[i] # is a particular base. mybase = order[orderindex[i]] out[0]+=mybase if doublestrandedpos(i, doublezones): out[1]+=findcomp(mybase) else: out[1]+='X' if testok(out): # update FIVEs and update SEVENs if len(out[0]) >= 5: myseq = reverse_complement(out[0][-5:]) # print("sequence: " + out[0][-5:] + " has reverse comp: " + myseq) FIVES[myseq] = True # reverse complement of last 5 is taken myseq = reverse_complement(out[1][-5:][::-1]) if 'X' not in myseq: FIVES[myseq] = True if len(out[0]) >= 7: myseq = reverse_complement(out[0][-7:]) updatesevens(myseq) myseq = reverse_complement(out[1][-7:][::-1]) if 'X' not in myseq: updatesevens(myseq) i=i+1 # can advance to next base # print("Here is the main strand out[0]") # print(out[0]) # print("Here is the complement strand out[1]") # print(out[1]) return out # approximate sevens # given a seven, find all the versions of sevens with either the middle base # or one of the bases on the sides changes def approximateseven(seq): out = [] for let in 'ATGC': out.append(seq[:3] + let + seq[-3:]) out.append(seq[:2] + let + seq[-4:]) out.append(seq[:4] + let + seq[-2:]) return(out) # remove the constraints having to do with the last letter def removeconstraints(out, i): global FIVES if i >= 4: myseq = out[0][-5:] rcmyseq = reverse_complement(myseq) FIVES[rcmyseq] = False # reset it myseq = out[1][-5:][::-1] if 'X' not in myseq: rcmyseq = reverse_complement(myseq) FIVES[rcmyseq] = False # reset it if i >= 6: myseq = out[0][-7:] rcmyseq = reverse_complement(myseq) resetsevens(rcmyseq) myseq = out[1][-7:][::-1] if 'X' not in myseq: rcmyseq = reverse_complement(myseq) resetsevens(rcmyseq) # given a seven, update it and all its neighbors (different by one # either in the center or one on either side) def updatesevens(seq): global SEVENS toupdate = approximateseven(seq) for u in toupdate: SEVENS[u] = True # given a seven, reset it and all its neighbors (different by one # either in the center or one on either side) def resetsevens(seq): global SEVENS toreset = approximateseven(seq) for r in toreset: SEVENS[r] = False def sevenscheck(seq): global SEVENS tocheck = approximateseven(seq) for c in tocheck: if SEVENS[c]: return True return False # find the complement of a base def findcomp(mybase): if mybase == 'A': return('T') if mybase == 'T': return('A') if mybase == 'C': return('G') if mybase == 'G': return('C') return(mybase) # determine whether i is in a doublezone def doublestrandedpos(i, doublezones): for d in doublezones: if (i >= d[0]) and (i <= d[1]): return True return False # determine whether the last base of a pre-existing sequence causes any problems # Pre-existing sequences are given to us. def testinitial_ok(out): L = len(out) # first take care of restricted sequences for r in full_restricted_seq: if L >= len(r): if r == out[-len(r):]: print( "Initial strand is restricted up to this point") return False # Now look for palindromes if length of the strand is at least 10??? if (len(out) > 4) and multipalindrome_check(out, 5): print( "Palindrome check for palindromes 5 or greater.") return False return True # determine whether the last base added to out (and its complement) # causes any issues. # see whether last base added (and possibly its complement) are ok def testok(out): global FIVES L = len(out[0]) myrevcomp = out[1][::-1] # first take care of restricted sequences for r in full_restricted_seq: if L >= len(r): if r == out[0][-len(r):]: # print( "Forward is restricted") return False if r == myrevcomp[:len(r)]: # print( "Rev complemented is restricted") return False # Now look for palindromes if length of the strand is at least 10??? if (len(out[0]) > 4) and multipalindrome_check(out[0], 5): print( "Palindrome check for palindromes 5 or greater.") return False # now take care of FIVES and SEVENS if (len(out[0])) >= 5: myseq = reverse_complement(out[0][-5:]) if FIVES[myseq]: # print( "FIVEs check forward") return False myseq = reverse_complement(myrevcomp[:5]) if (('X') not in myseq) and FIVES[myseq]: # print( "FIVEs check backwards") return False if (len(out[0])) >= 7: myseq = reverse_complement(out[0][-7:]) if sevenscheck(myseq): # there is a match of myseq or some neighbor # print( "SEVENs check forward") return False myseq = reverse_complement(myrevcomp[:7]) if (('X') not in myseq) and sevenscheck(myseq): # print( "SEVENs check backwards") return False return True # form all sequences assuming each atomic sequence is of length mylen def formseqs(mylen): outstrands = [] # first take care of ligations for lig in ligations: print('lig is: ') print(', '.join(lig)) totlen = mylen * len(lig) doublezones = [] doublezoneloc = 0 for x in lig: if hascomp(x): doublezones.append([doublezoneloc, doublezoneloc + mylen - 1]) doublezoneloc += mylen ret = addnewstrand(totlen, doublezones) if 0 < len(ret[0]): doublezoneloc = 0 for x in lig: outstrands.append([x, ret[0][doublezoneloc:doublezoneloc+mylen]]) if hascomp(x): outstrands.append([formcomp(x), (ret[1][doublezoneloc:doublezoneloc+mylen])[::-1]]) doublezoneloc+=mylen else: # nothing returned from addnewstrand print("Ligation: ") print(', '.join(lig)) print("failed to form.") print("Now take care of unligated strands") for s in allstrands: donestrands = [row[0] for row in outstrands] if s not in donestrands: print('strand is: ' + s) doublezones = [] if hascomp(s): doublezones.append([0, mylen - 1]) ret = addnewstrand(mylen, doublezones) if 0 < len(ret[0]): outstrands.append([s, ret[0][0:mylen]]) if hascomp(s): outstrands.append([formcomp(s), (ret[1][0:mylen])[::-1]]) else: print("Strand: " + s + " could not be formed") return(outstrands) # is the reverse complement of x in allstrands def hascomp(x): posscomp = x + 'comp' if posscomp in allstrands: return(True) if 4 < len(x): if (x[-4:] == 'comp') and (x[:-4] in allstrands): return(True) return(False) # form the complement name def formcomp(x): posscomp = x + 'comp' if posscomp in allstrands: return(posscomp) if 4 < len(x): if (x[-4:] == 'comp') and (x[:-4] in allstrands): return(x[:-4]) return(False) # DATA FIVES = {} genfives() SEVENS = {} gensevens() formRestrictedSequences() # adds in some new restrictions preexistingseqs = [] preexistingseqs.append('TGCAGCCATGCCGCGTGTATGAAGAAG') if 0 < len(preexistingseqs): for p in preexistingseqs: print "taking care of preexisting sequence: ", p # first check for restricted sequences and palindromes growingstring = '' for i in range(0,len(p)): growingstring += p[i] if not testinitial_ok(growingstring): print "ERROR IN PRE-EXISTING STRING PREFIX: ", growingstring # assert False # first the fives for i in range(0,len(p)-4): s = p[i:i+5] # print "s is: ", s FIVES[s] = True for i in range(0,len(p)-6): s = p[i:i+7] # print "s is: ", s SEVENS[s] = True # print(sorted(full_restricted_seq)) print("User must set the variables allstrands and ligations near the very bottom of the dnacompiler.py file.") allstrands = [ 'x', 'xcomp', 'y', 'z', 'zcomp', 'w', 'wcomp', 'v'] ligations = [['x','y','z'], ['wcomp', 'v']] # ligations must be disjoint # allstrands = [ 'x', 'xcomp'] # ligations = [['x']] # ligations must be disjoint # have to infer the overlap # loop over lengths and try to extend. outstrands = formseqs(14) # This routine does all the work. print("outstrands:") for s in outstrands: print(s) donestrands = [row[0] for row in outstrands] for lig in ligations: print('_'.join(lig)) out = '' for x in lig: i = donestrands.index(x) out+= outstrands[i][1] print(out) # gather statistics fivecount = 0 fiveout = [] for x in FIVES: if FIVES[x]: fivecount+=1 fiveout.append(x) # print("fivecount: " + str(fivecount)) # print(sorted(fiveout)) sevencount = 0 for x in SEVENS: if SEVENS[x]: sevencount+=1 # print("sevencount: " + str(sevencount))