diff --git a/fragmentGenome.py b/fragmentGenome.py new file mode 100644 index 0000000000000000000000000000000000000000..5976ba9ce6700dfe9c221376f5c5d65e87914772 --- /dev/null +++ b/fragmentGenome.py @@ -0,0 +1,146 @@ +import argparse, random +from Bio import SeqIO + +parser = argparse.ArgumentParser(description="") +parser.add_argument("-ge", "--genome", help="Contigs in fasta format", type=str, required=True) +parser.add_argument("-out", "--o", help="output", type=str, required=True) +args = parser.parse_args() + + +faIn = open(args.genome) +faOut = open(args.o, "w") + +minLen=2000 +maxLen=50000 +maxOL=-5000 +maxDist=5000 + +nFragments=500 +curStart=0 +it=0 +misjoins=20 +ins=20 +invs=20 +dels=20 + + + +gap=0 +for rec in SeqIO.parse(faIn, "fasta"): + maxLen=len(rec.seq) + curStart=0 + adjusts=[] + randInts=[] + randGap=[] + for i in range(0,nFragments-1): + random.seed(i*5) + randInts.append(random.randint(0, maxLen)) + randGap.append(random.randint(-5000, 5000)) + + sortedInts=sorted(randInts) + print(sortedInts) + print(randGap) + gap=0 + start=0 + it=0 + mjCounter=0 + insCounter=0 + delCounter=0 + invCounter=0 + for i in range(0, len(randInts)): + end=sortedInts[i] + reverse=random.randint(0,1) + + + + if reverse == 0: + frag=str(rec.seq[start:end]) + fragID=">Frag%i_%i-%i_len(%i)"%(it, start, end, len(frag)) + else: + frag=str(rec.seq[start:end].reverse_complement()) + fragID=">Frag%i_%i-%i_len(%i)_rev"%(it, start, end, len(frag)) + + newMisjoin=random.randint(0,1) + if newMisjoin == 1 and mjCounter<misjoins: + print("add misjoin") + mjLen=(random.randint(1000, 20000)) + mjstart=(random.randint(0, maxLen-mjLen)) + mjID="MISJOIN_%i-%i_len(%i)"%(mjstart, mjstart+mjLen, mjLen) + mjSeq=str(rec.seq[mjstart:mjstart+mjLen]) + + fragID+=mjID + frag+=mjSeq + mjCounter+=1 + + if len(frag) > 10000: + newIns=random.randint(0,5) + if newIns == 1 and insCounter<ins: + print("add ins") + insLen=(random.randint(1000, 5000)) + insstart=(random.randint(0, maxLen-insLen)) + insseq=str(rec.seq[insstart:insstart+insLen]) + inspoint=random.randint(1000, len(frag)-1000) + frag=frag[:inspoint] + insseq + frag[inspoint:] + fragID+="_INS_%i-%i"%(inspoint, inspoint+insLen) + insCounter+=1 + else: + newDel=random.randint(0,5) + if newDel == 1 and delCounter<dels: + print("add del") + delLen=(random.randint(1000, 5000)) + delpoint=random.randint(1000, len(frag)-1000) + frag=frag[:delpoint] + frag[delpoint+delLen:] + fragID+="_DEL_%i(%i)"%(delpoint, delLen) + delCounter+=1 + + + + gap=randGap[i] + faOut.write("%s\n%s\n"%(fragID, frag)) + start=end+gap + it+=1 + end=randInts[i]-gap + reverse=random.randint(0,1) + if reverse == 0: + frag=str(rec.seq[start:]) + fragID=">Frag%i_%i-%i_len(%i)"%(it, start, len(rec.seq), len(frag)) + else: + frag=str(rec.seq[start:].reverse_complement()) + fragID=">Frag%i_%i-%i_len(%i)_rev"%(it, start, len(rec.seq), len(frag)) + gap=randGap[i] + faOut.write("%s\n%s\n"%(fragID, frag)) + +''' +for rec in SeqIO.parse(faIn, "fasta"): + curLen=random.randint(minLen, maxLen) + print(rec.id) + inLen=len(rec.seq)-1 + while curStart+curLen < inLen: + reverse=random.randint(0, 1) + misjoin=random.randint(0, 10) + nextGAP=random.randint(maxOL, maxDist) + if misjoin==0: + print("do misjoin") + misstart=random.randint(0, len(rec.seq)-5001) + misend=misstart+5000 + misFrag=str(rec.seq[misstart:misend]) + misID="_MISJOIN_Frag%i_%i-%i"%(it, misstart, misend) + else: + misFrag="" + misID="" + if reverse==0: + curSeq=str(rec.seq[curStart:curStart+curLen])+misFrag + curID=">Frag%i_%i-%i"%(it, curStart, curStart+curLen)+misID + elif reverse==1: + curSeq=str(rec.seq[curStart:curStart+curLen].reverse_complement())+misFrag + curID=">Frag%i_%i-%i_reversed"%(it, curStart, curStart+curLen)+misID + faOut.write("%s\n%s\n"%(curID, curSeq)) + + + + curStart+=curLen + nextGAP + curLen=random.randint(minLen, maxLen) + it+=1 + +faIn.close() +'''