diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000000000000000000000000000000000000..429e53a4cba07b74109b551238d57b2f1d8797a1
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,4 @@
+[submodule "minimap2"]
+	path = minimap2
+	url = https://github.com/lh3/minimap2
+	branch = master
diff --git a/README.md b/README.md
index 9463379dc30f8900695a513113e7a0eccefcf115..ff1375c7328f2340cd26d67bf2d2cac87821c346 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,73 @@
 # masterTool
+## Tool to Scaffold and Quality Control genome assemblies
+<p align="center">
+<img src="graphics/simple_workflow.svg">
+</p>
+
+### Dependencies/Prerequisites:
+1. Python 3 (tested with 3.6.6)
+→ the script is executable and uses the machine's default python3
+→ it should be backward compatible down to Python 2.6
+2. Biopython needs to be installed for the Python version in use
+→ pip/pip3 install biopython
+3. NCBI-BLAST+ package (blastn) needs to be set up GLOBALLY
+4. minimap2 set up within the script folder
+→ the bundled binary was built on Ubuntu 18.04 and may need to be recompiled on other machines (“make”)
+
+### Setup:
+Clone the repository including minimap2:
+git clone --recursive https://github.com/mdriller/masterTool.git
+Move into the minimap2 directory and build it:
+cd masterTool/minimap2/
+make
+
+### Usage:
+If python3 is in your PATH the script can be called as "./main.py", otherwise use "python main.py".
+The help text can be accessed via: ./main.py -h|--help
+To run, the script needs a genome assembly in fasta format and long reads (e.g. PacBio or ONT) in fastq or fasta format.
+>usage: main.py [-h] -ge GENOME -fq FASTQ -out OUTDIR [-m MODE] [-t THREADS]
+               [-mo MINOVERLAP] [-mi MINIDENT] [-it ITERATIONS] [-s SIZE]
+               [-fa]
+
+>>optional arguments:
+  -h, --help            show this help message and exit
+  -ge GENOME, --genome GENOME
+                        Contigs in fasta format
+  -fq FASTQ, --fastq FASTQ
+                        Reads in fastq format
+  -out OUTDIR, --outdir OUTDIR
+                        Directory for output files
+  -m MODE, --mode MODE  Mode for different types of input data. Options:
+                        pb/ont [default=pb]
+  -t THREADS, --threads THREADS
+                        Threads to use (currently only during mapping)
+                        [default=4]
+  -mo MINOVERLAP, --minoverlap MINOVERLAP
+                        Minimum overlap needed for two contigs to be merged
+                        [default=100]
+  -mi MINIDENT, --minident MINIDENT
+                        Minimum identity in overlap needed for two contigs to
+                        be merged [default=90.0]
+  -it ITERATIONS, --iterations ITERATIONS
+                        Maximum number of breaking iterations; set to 0 to
+                        perform NO breaking
+  -s SIZE, --size SIZE  Size of the chunks the input reads are split into
+                        [default=500]
+  -fa, --fasta          Enable if input reads are in fasta format
+
+Example Usage:
+
+./main.py -s 500 -ge /home/max/tests/genome.fasta -fq /home/max/tests/pacbioReads.fastq -out /home/max/tests/output -it 3
+
+### Output files:
+
+1. scaffolds.fasta → final scaffolded assembly
+2. haplotigs.fasta → removed “haplotigs” or (partially) duplicated contigs
+3. adjusted_contigs.fa → set of adjusted contigs (if broken in the first iteration)
+4. conts_scafs.ids → maps contig IDs back to the new scaffold IDs
+5. scaffolds_QC_it<i>.gff → GFF file with the GOODNESS track for each iteration, where <i> denotes the assembly used as reference in that iteration.
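+
+For orientation, the per-scaffold gff output (written by writeScaf in main.py) is tab-separated, with one feature line per placed contig or gap-filling read; the scaffold/contig/read IDs and coordinates below are invented for illustration:
+
+    scaf_0	.	contig(tig00000001)	1	15000
+    scaf_0	.	read(read_abc123)	15001	15098
+    scaf_0	.	contig(tig00000042)	15099	40210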
+ +### Workflow +<p align="center"> +<img src="graphics/workflow.svg"> +</p> diff --git a/checkHitlist.py b/checkHitlist.py new file mode 100644 index 0000000000000000000000000000000000000000..25221eeb10cb5ce2649c66c7fa6d5304be69a448 --- /dev/null +++ b/checkHitlist.py @@ -0,0 +1,384 @@ +import globalVars + +# function to compare to fragments on same scaf to check for insertions +# and deletions +def checkINSDELS(hit1, hit2): + curError=None + curCorrect=None + #print("check INS/DELS") + hdist = ((hit2.split - hit1.split) - 1) * hit1.q_len + + if hit1.strand == "+" and hit2.strand == "+": + h1_end = min(hit1.t_end + (hit1.q_len - hit1.q_end), hit1.t_len-1) + h2_start = max(hit2.t_start - hit2.q_start, 0) + dist = h2_start - h1_end + diffDIST = dist - hdist + eStart=hit1.t_end + eEnd=hit2.t_start + cstart=hit1.t_start + cend=hit2.t_end + + elif hit1.strand == "-" and hit2.strand == "-": + h1_end = min(hit1.t_start - (hit1.q_len - hit1.q_end), hit1.t_len-1) + h2_start = max(hit2.t_end + hit2.q_start, 0) + dist = h1_end - h2_start + diffDIST = dist - hdist + eEnd=hit1.t_start + eStart=hit2.t_end-1 + cstart=hit1.t_end + cend=hit2.t_start + + if diffDIST >= globalVars.minSVL or diffDIST <= -1*globalVars.minSVL: + #print("Error detected") + #print(hit1) + #print(hit2) + #print(eStart, eEnd, diffDIST) + curError=(hit1.target, min(eStart, eEnd), max(eStart, eEnd), diffDIST) + #if diffDIST < 0: + # print("DELETION!!!") + else: + #print("Correct mapping") + curCorrect=(min(cstart, cend), max(cstart, cend)) + #print(curCorrect) + # print(hit1) + # print(hit2) + # print(curCorrect) + # print(curError) + return curError, curCorrect + + +# identify inversions +def checkINVS(hit1, hit2, oriHASH): + #print("check INVS") + # to store the hits/fragments of inversion --> start and end + #hdist = ((hit2.split - hit1.split) - 1) * hit1.q_len + #readOri = oriHASH[hit1.query] + + curError=None + + #if hit2.strand != oriHASH[hit2.query]: + invStart=min(hit1.t_start, hit2.t_start) + invEnd=max(hit1.t_start, hit2.t_start) + curError=(hit1.target, invStart, invEnd, invEnd-invStart) + return curError + + +def calcHitContig_Dist_rev(hit1, hit2): + err1=False + err2=False + h1_end = hit1.t_start - (hit1.q_len - hit1.q_end) # rest of contig 1 after extending aln to theoretical end can be negative + h2_start = hit2.t_len - (hit2.t_end + hit2.q_start)# rest of contig 2 after extending aln to theoretical end can be negative + # hdist->=distance between splits + if hit1.split != hit2.split: + hdist = ((hit2.split - hit1.split) - 1) * hit1.q_len + else: + hdist = 0 + # calc contig distance + cdist = hdist - (h1_end + h2_start) + if h1_end > hdist and h1_end > globalVars.splitLen: + #print("add error %s for cont1"%(hit1.target)) + #print(h1_end, hdist) + #print(hit1) + err1=True + if h2_start > hdist and h2_start > globalVars.splitLen: + #print("add error %s for cont2"%(hit2.target)) + #print(h2_start, hdist) + #print(hit2) + err2=True + ''' + if cdist<-1*globalVars.splitLen: + if hit1.t_start > globalVars.splitLen: + print("add error %s for cont1"%(hit1.target)) + print(hit1) + err1=True + if globalVars.contLenDICT[hit2.target]-hit2.t_end > globalVars.splitLen: + print("add error %s for cont2"%(hit2.target)) + print(hit2) + err2=True + ''' + #print("RevRev") + #print(err1, err2) + #print(hit1) + #print(h1_end) + #print(hit2) + #print(h2_start) + #print("\t%i\n"%(cdist)) + return cdist, err1, err2 + + +def calcHitContig_Dist_pos(hit1, hit2): + err1=False + err2=False + h1_end = hit1.t_len - (hit1.t_end + (hit1.q_len - 
hit1.q_end)) + h2_start = hit2.t_start - hit2.q_start + # hdist->distance between splits + if hit1.split != hit2.split: + hdist = ((hit2.split - hit1.split - 1) * hit1.q_len) + else: + hdist = 0 + # calc contig distance + cdist = hdist - (h1_end + h2_start) + if h1_end > hdist and h1_end > globalVars.splitLen: + #print("add error %s for cont1"%(hit1.target)) + #print(h1_end, hdist) + #print(hit1) + err1=True + if h2_start > hdist and h2_start > globalVars.splitLen: + #print(h2_start, hdist) + #print("add error %s for cont2"%(hit2.target)) + #print(hit2) + err2=True + + #print("PosPos") + #print(err1, err2) + #print(hit1) + #print(h1_end) + #print(hit2) + #print(h2_start) + #print("\t%i\n"%(cdist)) + return cdist, err1, err2 + + +def calcHitContig_Distance_posrev(hit1, hit2): + err1=False + err2=False + h1_end = hit1.t_len - (hit1.t_end + (hit1.q_len - hit1.q_end)) + h2_start = hit2.t_len - (hit2.t_end + hit2.q_start) + # hdist->distance between splits + if hit1.split != hit2.split: + hdist = ((hit2.split - hit1.split - 1) * hit1.q_len) + else: + hdist = 0 + # calc contig distance + cdist = hdist - (h1_end + h2_start) + if h1_end > hdist and h1_end > globalVars.splitLen: + #print("add error %s for cont1"%(hit1.target)) + #print(h1_end, hdist) + #print(hit1) + err1=True + if h2_start > hdist and h2_start > globalVars.splitLen: + #print("add error %s for cont2"%(hit2.target)) + #print(h2_start, hdist) + #print(hit2) + err2=True + ''' + if cdist<-1*globalVars.splitLen: + if globalVars.contLenDICT[hit1.target]-hit1.t_end > globalVars.splitLen: + print("add error %s for cont1"%(hit1.target)) + print(hit1) + err1=True + if globalVars.contLenDICT[hit2.target]-hit2.t_end > globalVars.splitLen: + print("add error %s for cont2"%(hit2.target)) + print(hit2) + err2=True + ''' + #print("PosRev") + #print(err1, err2) + #print(hit1) + #print(h1_end) + #print(hit2) + #print(h2_start) + #print("\t%i\n"%(cdist)) + return cdist, err1, err2 + + +def calcHitContig_Distance_revpos(hit1, hit2): + err1=False + err2=False + h1_end = hit1.t_start - (hit1.q_len - hit1.q_end) + h2_start = hit2.t_start - hit2.q_start + # hdist->distance between splits + if hit1.split != hit2.split: + hdist = ((hit2.split - hit1.split - 1) * hit1.q_len) + else: + hdist = 0 + # calc contig distance + cdist = hdist - (h1_end + h2_start) + if h1_end > hdist and h1_end > globalVars.splitLen: + #print("add error %s for cont1"%(hit1.target)) + #print(h1_end, hdist) + #print(hit1) + err1=True + if h2_start > hdist and h2_start > globalVars.splitLen: + #print("add error %s for cont2"%(hit2.target)) + #print(h2_start, hdist) + #print(hit2) + err2=True + ''' + if cdist<-1*globalVars.splitLen: + if hit1.t_start > globalVars.splitLen: + print("add error %s for cont1"%(hit1.target)) + print(hit1) + err1=True + if hit2.t_start > globalVars.splitLen: + print("add error %s for cont2"%(hit2.target)) + print(hit2) + err2=True + ''' + + #print("RevPos") + #print(err1, err2) + #print(hit1) + #print(h1_end) + #print(hit2) + #print(h2_start) + #print("\t%i\n"%(cdist)) + return cdist, err1, err2 + + +# compares 2 hits on different contigs --> adds edges +def connectCONTS(prevhit, curhit, edgeHash): + #print("running connectCONTS") + #print(prevhit) + #print(curhit) + #if alns not at borders add errors for breaking of misassemblies + errors=[] + if prevhit.strand == '+' and curhit.strand == '+': + # nodes to set connections from oriHA + startNode = prevhit.target + "_1" + endNode = curhit.target + "_0" + hDist, e1, e2 = calcHitContig_Dist_pos(prevhit, curhit) 
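+        # note: curError1/curError2 below are single-position breakpoints
+        # obtained by extending each alignment to the ends of its read
+        # fragment; they are only recorded (via the e1/e2 flags) when a
+        # contig reaches well past the read evidence, i.e. a candidate misjoin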
+        #curError1=(prevhit.target, max(0,prevhit.t_end-100), min(prevhit.t_len-1, prevhit.t_end+100))
+        #curError2=(curhit.target, max(0,curhit.t_start-100), min(curhit.t_len-1,curhit.t_start+100))
+        curError1=(prevhit.target, prevhit.t_end+(prevhit.q_len-prevhit.q_end), prevhit.t_end+(prevhit.q_len-prevhit.q_end))
+        curError2=(curhit.target, curhit.t_start-curhit.q_start, curhit.t_start-curhit.q_start)
+
+    elif prevhit.strand == '-' and curhit.strand == '-':
+        # nodes to set connections from
+        startNode = prevhit.target + "_0"
+        endNode = curhit.target + "_1"
+        hDist, e1, e2 = calcHitContig_Dist_rev(prevhit, curhit)
+        #curError1=(prevhit.target, max(0,prevhit.t_start-100), min(prevhit.t_len-1,prevhit.t_start+100))
+        #curError2=(curhit.target, max(0,curhit.t_end-100), min(curhit.t_len-1,curhit.t_end+100))
+        curError1=(prevhit.target, prevhit.t_start+(prevhit.q_start), prevhit.t_start+(prevhit.q_start))
+        curError2=(curhit.target, curhit.t_end+(curhit.q_start), curhit.t_end+(curhit.q_start))
+
+    elif prevhit.strand == '+' and curhit.strand == '-':
+        # nodes to set connections from
+        startNode = prevhit.target + "_1"
+        endNode = curhit.target + "_1"
+        hDist, e1, e2 = calcHitContig_Distance_posrev(prevhit, curhit)
+        #curError1=(prevhit.target, max(0,prevhit.t_end-100), min(prevhit.t_len-1,prevhit.t_end+100))
+        #curError2=(curhit.target, max(0,curhit.t_end-100), min(curhit.t_len-1,curhit.t_end+100))
+        curError1=(prevhit.target, prevhit.t_end+(prevhit.q_len-prevhit.q_end), prevhit.t_end+(prevhit.q_len-prevhit.q_end))
+        curError2=(curhit.target, curhit.t_end+(curhit.q_start), curhit.t_end+(curhit.q_start))
+
+    elif prevhit.strand == '-' and curhit.strand == '+':
+        # nodes to set connections from
+        startNode = prevhit.target + "_0"
+        endNode = curhit.target + "_0"
+        hDist, e1, e2 = calcHitContig_Distance_revpos(prevhit, curhit)
+        #curError1=(prevhit.target, max(0,prevhit.t_start-100), min(prevhit.t_len-1,prevhit.t_start+100))
+        #curError2=(curhit.target, max(0,curhit.t_start-100), min(curhit.t_len-1,curhit.t_start+100))
+        curError1=(prevhit.target, prevhit.t_start+(prevhit.q_start), prevhit.t_start+(prevhit.q_start))
+        curError2=(curhit.target, curhit.t_start-curhit.q_start, curhit.t_start-curhit.q_start)
+
+    # if a contig extends far past the read evidence (long overhang),
+    # record the breakpoint as a misjoin error
+    #if e1==True and prevhit.split==curhit.split-1:
+    if e1==True:
+        #print("APPEND MISJOIN", curError1)
+        #print(prevhit)
+        #print(curhit)
+        errors.append(curError1)
+    #if e2==True and prevhit.split==curhit.split-1:
+    if e2==True:
+        #print("APPEND MISJOIN", curError2)
+        #print(prevhit)
+        #print(curhit)
+        errors.append(curError2)
+    # sort both nodes lexicographically to not have 2 different edges between
+    # the same nodes e.g. 
one a->b and b->a in the dict + if startNode < endNode: + node1 = startNode + node2 = endNode + else: + node1 = endNode + node2 = startNode + + + # add edge to "first" node + if not node1 in edgeHash: + edgeHash[node1] = {} + edgeHash[node1][node2] = [hDist] + #print("added edge with weight %i for conts %s and %s"%(hDist, node1, node2)) + #save readID spanning the gap + globalVars.gapDICT[node1]={} + globalVars.gapDICT[node1][node2]=[curhit.query] + #print("Read %s connects conts %s and %s with dist %i"%(curhit.query, node1, node2, hDist)) + #print(edgeHash[node1][node2]) + else: + if not node2 in edgeHash[node1]: + edgeHash[node1][node2] = [hDist] + #print("added edge with weight %i for conts %s and %s"%(hDist, node1, node2)) + #save readID spanning the gap + globalVars.gapDICT[node1][node2]=[curhit.query] + #print("Read %s connects conts %s and %s with dist %i"%(curhit.query, node1, node2, hDist)) + #print(edgeHash[node1][node2]) + else: + edgeHash[node1][node2].append(hDist) + #print("added edge with weight %i for conts %s and %s"%(hDist, node1, node2)) + #save readID spanning the gap + globalVars.gapDICT[node1][node2].append(curhit.query) + #print("Read %s connects conts %s and %s with dist %i"%(curhit.query, node1, node2, hDist)) + #print(edgeHash[node1][node2]) + return errors + + +# general function to loop over each hit list and compare each split to +# the next one --> calls connectCONTs and checkCONT +def checkHitlist(hitlist, edgeGRAPH, oriHASH): + lastHit = "" + # compare 2 hits following each other + for curHits in hitlist: + + #if curHit.target==globalVars.largestContig[0]: + #print("hit on largest contig found --> add coverage") + # addLongestCOV(curHit) + #if len(curHits)>1: + # for curHit in curHits: + # globalVars.goodE[curHit.target].append((curHit.target, min(curHit.t_start, curHit.t_end), max(curHit.t_start, curHit.t_end))) + + if len(curHits)==1: + curHit=curHits[0] + #globalVars.goodE[curHit.target].append((curHit.target, min(curHit.t_start, curHit.t_end), max(curHit.t_start, curHit.t_end))) + #compare two hits + if lastHit!="": + #print("Comparing 2 hits:") + #print(lastHit) + #print(curHit) + + # check hits on same contig --> e.g. 
continuity + if curHit.target == lastHit.target: + if curHit.strand == lastHit.strand: + #print("Check for INS/DEL") + # check for inseinsertErrorrtions and deletions + cError, cGood= checkINSDELS(lastHit, curHit) + #print("GOOD:", cGood) + #print("BAD:", cError) + + if cError!=None: + if cError[3] > 0: + globalVars.insE[cError[0]].append(cError) + else: + globalVars.delE[cError[0]].append(cError) + else: + globalVars.goodE[curHit.target].append((curHit.target, cGood[0], cGood[1])) + + else: + #print("Check for INV") + # add inversions + cError=checkINVS(lastHit, curHit, oriHASH) + if cError!=None: + globalVars.invE[cError[0]].append(cError) + + # check contig connections --> add edges to graph + else: + #print("Check for CONTIG CONNECTION\n") + cErrors=connectCONTS(lastHit, curHit, edgeGRAPH) + for e in cErrors: + #e[2]+=100 + globalVars.mjE[e[0]].append(e) + + lastHit=curHit + + return edgeGRAPH \ No newline at end of file diff --git a/globalVars.py b/globalVars.py new file mode 100644 index 0000000000000000000000000000000000000000..32c75caa0d54cff2b04db8895bb1019f783c314d --- /dev/null +++ b/globalVars.py @@ -0,0 +1,249 @@ +import sqlite3, subprocess +############################### +#######global variables######## +############################### + +################################### +###classes######################### +################################### + + +class coGRAPH: + + def __init__(self): + self.cGraph={} + + def addContigEdge(self, cID, sLen): + cEdge1 = myEdge(cID + "_0", cID + "_1", sLen, sLen, 0, 'c') + cEdge2 = myEdge(cID + "_1", cID + "_0", sLen, sLen, 0, 'c') + self.cGraph[cID + "_0"] = myCont(cEdge1, "", {}) + self.cGraph[cID + "_1"] = myCont(cEdge2, "", {}) + + def removeNode(self, cID): + del self.cGraph[cID] + print("%s had been removed"%(cID)) + + #function to return distance between node a and b if there is a path + def returnDist(self, a, b, circle=False): + visited=[] + restart=False + #print("getting distance") + #print(a,b) + #print(self.cGraph[a].active) + #print(self.cGraph[b].active) + #print(self.cGraph[a].cEdge) + #print(self.cGraph[b].cEdge) + if a in self.cGraph and b in self.cGraph: + #print("both in graph") + nextN=a + #print(nextN) + #print(self.cGraph[nextN].active) + dist=0 + while self.cGraph[nextN].active != "": + visited.append(nextN) + dist+=self.cGraph[nextN].active.dist + nextN=self.cGraph[nextN].active.target + #print(nextN, b) + if nextN==b: + #print("FOUND IT") + if dist==0: + dist=1 + return(dist) + nextN=self.cGraph[nextN].cEdge.target + #print(nextN) + if nextN in visited: + restart=True + break + visited.append(nextN) + if circle==True: + if nextN==b: + #print("FOUND IT") + if dist==0: + dist=1 + return(dist) + + if restart==True: + #print("CIRCLE DETECTED RESTART") + return self.returnDist(a, b, True) + + return False + + else: + print("wut not both in graph???") + return False + + + # function to find start of a path starting with a node in the path + def findStart(self, anode): + inPath=[] + #print("Finding start for node %s"%(anode)) + # just to make sure to alayws start at 0 end of contig and then go to the + # left from there + if anode.endswith("0"): + cN = anode + elif anode.endswith("1"): + cN = self.cGraph[anode].cEdge.target + else: + print("ERROR incorrect scaffold name found %s" %(anode)) + + inPath.append(cN[:-2]) + + while self.cGraph[cN].active != "": + cN = self.cGraph[cN].active.target + #print(cN) + if cN[:-2] in inPath: + print("CIRCLE FOUND!!!!") + #print(self.cGraph[cN].cEdge) + #print(self.cGraph[cN].active) + 
return cN + inPath.append(cN[:-2]) + # move to other end of the contig for possible other read edge --> + # otherwse endless loop + # print(cN) + cN = self.cGraph[cN].cEdge.target + #print("Start of path found: %s"%(cN)) + return cN + + def getPath(self, anode): + inPath=[] + startNode=self.findStart(anode) + curNode=startNode + while 1: + if curNode in inPath: + print("getPath - Circle found!!!") + exit() + + inPath.append(curNode) + curNode=self.cGraph[curNode].cEdge.target + inPath.append(curNode) + + if self.cGraph[curNode].active == "": + print("End of path reached") + return inPath + else: + curNode=self.cGraph[curNode].active.target + + + + + # returns an alternating path of edges + def returnEgdePath(self, anode): + inPath=[] + ePath=[] + pHash={} + curDist=0 + startNode = self.findStart(anode) + curNode=startNode + inPath.append(startNode[:-2]) + pHash[startNode]=(curDist, 'a') + ePath.append(self.cGraph[curNode].cEdge) + curDist+=self.cGraph[curNode].cEdge.dist + curNode=self.cGraph[curNode].cEdge.target + + while self.cGraph[curNode].active != "": + pHash[curNode]=(curDist, 'b') + ePath.append(self.cGraph[curNode].active) + curDist+=self.cGraph[curNode].active.dist + curNode=self.cGraph[curNode].active.target + pHash[curNode]=(curDist, 'a') + ePath.append(self.cGraph[curNode].cEdge) + curDist+=self.cGraph[curNode].cEdge.dist + curNode=self.cGraph[curNode].cEdge.target + + if not curNode[:-2] in inPath: + inPath.append(curNode[:-2]) + + return ePath, pHash + + + +class myCont: + + def __init__(self, cEdge, active, eList): + self.cEdge=cEdge + self.active=active + self.eList=eList + + +# class for an edge +class myEdge: + + def __init__(self, start, target, weight, dist, stdev, direct): + self.start = start + self.target = target + self.weight = weight + self.dist = dist + self.stdev = stdev + self.direct = direct + + # comparing edes will comapre their weights + def __lt__(self, other): + return self.weight < other.weight + + def __gt__(self, other): + return self.weight > other.weight + + # printing function + def __str__(self): + return("\t%s, %s, weight:%i, dist:%i, stdev:%f, %s" % + (self.start, self.target, self.weight, self.dist, self.stdev, self.direct)) + + +# class for outputting scaffolds in the end +class mySCAF: + + def __init__(self, names, seqs, dirs): + self.names = names + self.seqs = seqs + self.dirs = dirs + + +# function to initialize global variables +def init(out, sL, mL, mO, mI, t, mSV, bPATH, mPATH): + global largestContig, distError, outDir, splitLen, minOverlap, minLinks + global minIdent, threads, contsMerged, contLenDICT, contDICT, mergedDICT, readHash + global contigGR, adjustDICT, gapDICT, eDICT, edgeDICT + global blastCounter, poserrorHASH, minSVL + global blastP, minimap2P + + global insE, delE, invE, mjE, goodE + insE={} + delE={} + invE={} + mjE={} + goodE={} + + blastCounter=0 + poserrorHASH={} + readHash={} + + outDir = out + splitLen = sL + #variable for largest contig + largestContig=("", 0) + #largestMapped=[] + #distance error is margin of read mappings + distError=float(splitLen)*0.15*2 + minLinks = mL + minOverlap = mO + minIdent = mI + threads = t + contsMerged=0 + minSVL=mSV + + contLenDICT = {} + contDICT = {} + # saves the ids of merged contigs + mergedDICT = {} + # contig GRAPH + contigGR = {} + # adjust hash stores edges and saves the new edge to which they should be + # redirected and the new distance that needs to be added to it + adjustDICT = {} + #for errors --> key scafID --> returns reads that need to be mapped back + eDICT = 
{} + # store readIDs of read spanning gaps + gapDICT={} + # for contig edges implied by hits + edgeDICT={} + diff --git a/gpma.py b/gpma.py new file mode 100644 index 0000000000000000000000000000000000000000..e964d82282198b72ae3935e01d7c2af0e1a8ba76 --- /dev/null +++ b/gpma.py @@ -0,0 +1,456 @@ +import globalVars +import merging +import copy + +################################## +################################## +######functions################### +################################## +################################## + + +# function to set edges in graph +def setEdges(setEdge, agraph): + + + if setEdge.direct!="": + newEdgeStart = globalVars.myEdge(setEdge.start, setEdge.target, + setEdge.weight, setEdge.dist, setEdge.stdev, setEdge.direct) + newEdgeTarget = globalVars.myEdge(setEdge.target, setEdge.start, + setEdge.weight, setEdge.dist, setEdge.stdev, setEdge.direct) + else: + newEdgeStart="" + newEdgeTarget="" + #print(setEdge) + #print("\tLooks good --> Setting edge: %s" % (newEdgeStart)) + #print("\tLooks good --> Setting edge: %s" % (newEdgeTarget)) + + oldEdge1=copy.copy(agraph.cGraph[setEdge.start].active) + oldEdge2=copy.copy(agraph.cGraph[setEdge.target].active) + if oldEdge1=="": + oldEdge1=globalVars.myEdge(setEdge.start, setEdge.target, 0, 0, 0, "") + if oldEdge2=="": + oldEdge2=globalVars.myEdge(setEdge.target, setEdge.start, 0, 0, 0, "") + agraph.cGraph[setEdge.start].active = newEdgeStart + agraph.cGraph[setEdge.target].active = newEdgeTarget + oldEdge=[oldEdge1, oldEdge2] + #print("OLD EDGE") + #print(oldEdge[0]) + #print(oldEdge[1]) + return oldEdge + +def resetEdge(setEdge, agraph): + #print("resetting edge") + #print(setEdge) + if setEdge.direct!="": + newEdgeStart = globalVars.myEdge(setEdge.start, setEdge.target, + setEdge.weight, setEdge.dist, setEdge.stdev, setEdge.direct) + else: + newEdgeStart="" + + agraph.cGraph[setEdge.start].active = newEdgeStart + +#function to adjust the gap dictionary in case +def adjustGAPdict(new1, new2, old1, old2): + + #print("Adjusting gap dict from %s to %s by taking them from %s to %s"%(new1, new2, old1, old2)) + + if old1 < old2: + n1 = old1 + n2 = old2 + else: + n1 = old2 + n2 = old1 + #print(n1, n2) + gapReads=globalVars.gapDICT[n1][n2] + + if new1 < new2: + node1 = new1 + node2 = new2 + else: + node1 = new2 + node2 = new1 + if globalVars.gapDICT.get(node1, "nope")=="nope": + globalVars.gapDICT[node1]={} + globalVars.gapDICT[node1][node2]=[] + elif globalVars.gapDICT[node1].get(node2, "nope")=="nope": + globalVars.gapDICT[node1][node2]=gapReads + else: + globalVars.gapDICT[node1][node2]+=gapReads + +# function to properly remove key from a dictionary +def removeMergedNEW(rkey, newTar, addDist): + + #print("Removing %s from dict"%(rkey)) + #print("Now pointing to %s with dist %i"%(newTar, addDist)) + globalVars.adjustDICT[rkey]=(newTar, addDist) + +def checkMerge(curEdge, cHASH): + #print(curEdge.dist, globalVars.minOverlap*-1) + #print(curEdge.dist <= globalVars.minOverlap*-1) + #print("\tContigs are apparently overlapping - check for overlap for possible merging") + #check if merging is possible and get the real overlap + merged, realOL, mAdjust = merging.runBLAST(curEdge.start, curEdge.target, curEdge.dist, True) + #if cant be merged still set edge with distance 0 + #print("\t"+str(merged)) + if merged==True: + #print("\tsucessfully merged") + #print(curEdge.start, curEdge.target, curEdge.dist, realOL, curEdge.dist*(-1)-realOL*(-1)) + curEdge.dist=realOL + + else: + if merged=="Removed": + #removeMergedNEW(cHASH, 
mAdjust[0][:-2]+"_0", mAdjust[1], mAdjust[2]) + #removeMergedNEW(cHASH, mAdjust[0][:-2]+"_0", mAdjust[3], mAdjust[4]) + curEdge.dist=realOL + #print("Cont %s was part of the other -> abort for now"%(mAdjust[0][:-2])) + + return merged, mAdjust + +def mergeNEW(nEdge, inGraph): + graph=inGraph.cGraph + sucess=True + edgeList=[] + removal=[] + #print("Start Merging Paths") + inPath=[] + curEdge=copy.copy(nEdge) + end=False + zipper='r' + #zip right + firstL=False + while end==False: + #print("while") + #print(curEdge) + if curEdge.start[:-2] == curEdge.target[:-2]: + #print("\tEdge with itself - skip") + end=True + break + + #if original edge is supposed to overlap contigs check for overlap first + #abort if overlap not there + if curEdge.dist < 0: + merged, eAdjust=checkMerge(curEdge, inGraph) + if merged==False: + if curEdge.dist > (-2)*globalVars.distError: + #print("Merging failed but distance within range --> set to 1N for now") + curEdge.dist=1 + else: + #print("Merging failed!!!") + sucess=False + return sucess, [], [] + if merged == "Removed": + next1=copy.copy(graph[curEdge.start].active) + next2=copy.copy(graph[curEdge.target].cEdge) + next22=copy.copy(graph[next2.target].active) + #edgeList.append(copy.copy(curEdge)) + if curEdge.dist*(-1) >= globalVars.contLenDICT[curEdge.target[:-2]]-20: + #print("Duplicate case1: contig %s is part of %s"%(curEdge.target, curEdge.start)) + if next22=="": + #print("End of path reached") + removal.append(eAdjust) + if graph[curEdge.target].active!="": + curEdge=copy.copy(graph[curEdge.target].active) + #print("Trying to zipper left path from merged cont guide edge:") + #print(curEdge) + else: + #print("nothing left to merge done here") + break + else: + curEdge.start=curEdge.start + curEdge.target=next22.target + curEdge.dist=curEdge.dist + (next2.dist + next22.dist) + #print("Merged implied edge:") + #print(curEdge) + removal.append(eAdjust) + #next iteration + continue + elif curEdge.dist*(-1) >= globalVars.contLenDICT[curEdge.start[:-2]]-20: + #print("Duplicate case2: contig %s is part of %s"%(curEdge.start, curEdge.target)) + if next1=="": + #print("End of path reached") + removal.append(eAdjust) + otherEnd = graph[curEdge.start].cEdge.target + if graph[otherEnd].active!="": + curEdge=copy.copy(graph[otherEnd].active) + #print("Trying to zipper left path from merged cont guide edge:") + #print(curEdge) + else: + #print("nothing left to merge done here") + break + else: + curEdge.start=next2.target + curEdge.target=next1.target + curEdge.dist=next1.dist - curEdge.dist - next2.dist + #print("Merged implied edge:") + #print(curEdge) + removal.append(eAdjust) + #next iteration + continue + else: + print("WTF why removed???") + print(globalVars.contLenDICT[curEdge.start[:-2]]) + print(globalVars.contLenDICT[curEdge.target[:-2]]) + exit() + + #sucess=False + #return sucess, [] + + if firstL==True: + #print("skip first left") + next1 = globalVars.myEdge(edgeList[0].target, edgeList[0].start, edgeList[0].weight, edgeList[0].dist, edgeList[0].stdev, edgeList[0].direct) + firstL=False + else: + next1=copy.copy(graph[curEdge.start].active) + next2=copy.copy(graph[curEdge.target].cEdge) + if next1!="": + next11=copy.copy(graph[next1.target].cEdge) + #print(next1) + #print(next11) + #print(next2) + if curEdge.start == next1.start and curEdge.target == next1.target: + if abs(curEdge.dist - next1.dist) < 3*curEdge.stdev: + sucess=True + break + else: + #print("Egde already exists but distance failed") + next1="" + sucess=False + break + elif next1=="": + 
#print("End of %s-path reached edge looks good to set"%(zipper)) + edgeList.append(curEdge) + if zipper=='r': + zipper='l' + firstL=True + curEdge=copy.copy(graph[nEdge.target].active) + if curEdge!="": + #print("Checking left path starting with edge") + #print(curEdge) + continue + else: + #print("Original edge had nothing left to merge") + #print(nEdge) + #print(curEdge) + end=True + break + else: + end=True + break + + + #case curEdge+contigEdge of p2 fits in next read edge of p1 + #case1 + #if next1.dist >= curEdge.dist and next1.dist > 0: + if next1.dist >= curEdge.dist: + #adjust gapHASH if new dist > 0 + if (next1.dist-(curEdge.dist+next2.dist)) > (-2)*globalVars.distError: + #print("New dist %i > 0 adjust gapHASH"%(next1.dist-(curEdge.dist+next2.dist))) + adjustGAPdict(next2.target, next1.target, next1.start, next1.target) + #add curEdge to list because it would be set if merging is possible + edgeList.append(copy.copy(curEdge)) + #imply new edge + curEdge.start=next2.target # flip target and start here to continue + curEdge.target=next1.target # flip target and start here to continue + curEdge.dist=next1.dist-(curEdge.dist+next2.dist) + #print("\timplied edge1") + #print(curEdge) + continue + + #case readEdge+ next contigEdge of p1 fits in curEdge + #case2 + #elif curEdge.dist > next1.dist and curEdge.dist > 0: + elif curEdge.dist > next1.dist: + #adjust gapHASH if new dist > 0 + if (curEdge.dist-(next1.dist+next11.dist)) > (-2)*globalVars.distError: + #print("New dist %i > 0 adjust gapHASH"%(curEdge.dist-(next1.dist+next11.dist))) + adjustGAPdict(next11.target, curEdge.target, curEdge.start, curEdge.target) + #imply new edge + curEdge.start=next11.target # no flip needed + curEdge.target=curEdge.target # no flip needed + curEdge.dist=curEdge.dist-(next1.dist+next11.dist) + + #print("\timplied edge2") + #print(curEdge) + continue + + return sucess, edgeList, removal + + # returns an alternating path of edges +def getNodePath(ggraph, anode): + #print("Get node path") + ePath=[] + pHash=[] + curDist=0 + startNode = ggraph.findStart(anode) + curNode=startNode + pHash.append(startNode) + ePath.append(ggraph.cGraph[curNode].cEdge) + curDist+=ggraph.cGraph[curNode].cEdge.dist + curNode=ggraph.cGraph[curNode].cEdge.target + + while ggraph.cGraph[curNode].active != "" and curNode not in pHash: + #print(curNode) + pHash.append(curNode) + ePath.append(ggraph.cGraph[curNode].active) + curNode=ggraph.cGraph[curNode].active.target + #print(curNode) + pHash.append(curNode) + ePath.append(ggraph.cGraph[curNode].cEdge) + curNode=ggraph.cGraph[curNode].cEdge.target + + + pHash.append(curNode) + + return ePath, pHash + + +def checkHappiness(pdict, cDICT): + nH=0 + nUH=0 + #print("Checking happiness of path") + checked=[] + #print(len(pdict)) + for c1 in pdict: + checked.append(c1) + for c2 in pdict: + #print(c1, c2) + # sort both nodes lexicographically to not have 2 different edges between + # the same nodes e.g. 
one a->b and b->a in the dict + if c1 < c2: + node1 = c1 + node2 = c2 + else: + node1 = c2 + node2 = c1 + + if not c2 in checked: + if node1 in globalVars.edgeDICT: + if node2 in globalVars.edgeDICT[node1]: + dist=cDICT.returnDist(node1, node2) + #print("%s and %s both in list with dist: %i"%(node1, node2, dist)) + curEdges=globalVars.edgeDICT[node1][node2] + #curEdges+=globalVars.edgeDICT[node2][node1] + for curEdge in curEdges: + if curEdge[0] >= globalVars.minLinks: + #print(node1, node2) + #print("comparing distance %i with edge"%(dist)) + #print(curEdge) + if dist != False: + #print("Orientations fit set distance is %i predicted distance %i"%(dist, curEdge[1])) + if abs(curEdge[1] - dist) <= max(3*curEdge[2], 100): #check if within 3*stdev of bundeld edge + #print("Distance fits increasing happy by %i"%(curEdge[0])) + nH+=curEdge[0] + continue + else: + #print("Distance does not fit increasing UNhappy by %i"%(curEdge[0])) + nUH+=curEdge[0] + continue + else: + #print("Incorrect orientation increasing UNhappy by %i"%(curEdge[0])) + nUH+=curEdge[0] + continue + + + return nH, nUH + + +# main function to run gpma +def gpma(newEdge, inGraph): + #print("gpma") + inputEdge=copy.copy(newEdge) + #print(newEdge) + + # check if one edge points to a merged contig --> in that case needs to be + # adjusted + # while loop to go through multiple adjustments + while globalVars.adjustDICT.get(newEdge.start, "nope") != "nope": + #print(inputEdge) + #if globalVars.adjustDICT.get(newEdge.start, "nope") != "nope": + newEdge.dist -= globalVars.adjustDICT[newEdge.start][1] + newEdge.start = globalVars.adjustDICT[newEdge.start][0] + #print(newEdge.start, newEdge.target, inputEdge.start, inputEdge.target) + adjustGAPdict(newEdge.start, newEdge.target, inputEdge.start, inputEdge.target) + + #print("edge adjusted to merged edge") + #print(newEdge) + + while globalVars.adjustDICT.get(newEdge.target, "nope") != "nope": + #print(inputEdge) + #if globalVars.adjustDICT.get(newEdge.target, "nope") != "nope": + newEdge.dist -= globalVars.adjustDICT[newEdge.target][1] + newEdge.target = globalVars.adjustDICT[newEdge.target][0] + #print(newEdge.start, newEdge.target, inputEdge.start, inputEdge.target) + adjustGAPdict(newEdge.start, newEdge.target, inputEdge.start, inputEdge.target) + #print("edge adjusted to merged edge") + #print(newEdge) + + + inputEdge=copy.copy(newEdge) + #start directions + + #print("Path1:") + #ePath1, pHASH1 = inGraph.returnEgdePath(newEdge.start) + ePath1, pHASH1 = getNodePath(inGraph, newEdge.start) + #print(len(pHASH1)) + #for ele in ePath1: + # print(ele) + nHP1, nUHP1 = checkHappiness(pHASH1, inGraph) + #print("Happiness p1: %i/%i(H/UH)"%(nHP1, nUHP1)) + + #print("Path2:") + #ePath2, pHASH2 = inGraph.returnEgdePath(newEdge.target) + ePath2, pHASH2 = getNodePath(inGraph, newEdge.target) + #print(len(pHASH2)) + #for ele in ePath2: + # print(ele) + nHP2, nUHP2 = checkHappiness(pHASH2, inGraph) + #print("Happiness p2: %i/%i(H/UH)"%(nHP2, nUHP2)) + + result, setList, removed = mergeNEW(inputEdge, inGraph) + + if result==True: + oldEdges=[] + for edge in setList: + oE=setEdges(edge, inGraph) + oldEdges=oE+oldEdges + + #print("New path:") + t=True + for ele in removed: + if ele[0][:-2] == newEdge.start[:-2]: + t=False + if t== True: + newPath, newHASH = getNodePath(inGraph, newEdge.start) + else: + newPath, newHASH = getNodePath(inGraph, newEdge.target) + #print(len(newHASH)) + #for ele in newPath: + # print(ele) + newHP, newUHP = checkHappiness(newHASH, inGraph) + + + #check happiness + if newHP - 
(nHP1+nHP2) >= newUHP - (nUHP1+nUHP2): + #print("GPMA VERY HAPPY") + for remove in removed: + removeMergedNEW(remove[0][:-2]+"_0", remove[1], remove[2]) + removeMergedNEW(remove[0][:-2]+"_1", remove[3], remove[4]) + else: + #print("GPMA UNHAPPY EDGE SHOULD NOT HAVE BEEN SET") + #[print(e) for e in newPath] + #print("Happiness NEW PATH: %i/%i(H/UH)"%(newHP, newUHP)) + #[print(e) for e in ePath1] + #print("Happiness p1: %i/%i(H/UH)"%(nHP1, nUHP1)) + #[print(e) for e in ePath2] + #print("Happiness p2: %i/%i(H/UH)"%(nHP2, nUHP2)) + for edge in oldEdges: + resetEdge(edge, inGraph) + + + + #else: + # print("GPMA FAILED!!!") + + + diff --git a/main.py b/main.py new file mode 100755 index 0000000000000000000000000000000000000000..83ed55f67074140868c845666eeedf40d6a212c4 --- /dev/null +++ b/main.py @@ -0,0 +1,975 @@ +#!/usr/bin/env python3 + +import argparse, subprocess, sys +import statistics +import datetime +import copy +# import own scripts for stuff +import checkHitlist +from gpma import gpma +import globalVars +import splitReads +import merging + +try: + from subprocess import DEVNULL # py3k +except ImportError: + import os # for python2 + DEVNULL = open(os.devnull, 'wb') +# biopython for now --> for reverse complement function +#from Bio.Seq import Seq + + +############################### +#######global variables######## +############################### + + + +# key is readID(without split) --> save orientation for each read to check +# for inversions later +oriHASH = {} + +################################### +#############classes############### +################################### + + +# class for for each hit within the PAF file +class myHIT: + + def __init__(self, query, split, strand, target, t_len, t_start, t_end, q_len, q_start, q_end, mq): + self.query = query + self.split = split + self.strand = strand + self.target = target + self.t_len = t_len + self.t_start = t_start + self.t_end = t_end + self.q_len = q_len + self.q_start = q_start + self.q_end = q_end + self.mq = mq + + def __str__(self): + return("\t%s, %i, %s, %s, %s, %s, %s, %s, %s, %s, %i" % (self.query, self.split, self.strand, self.target, str(self.t_len), str(self.t_start), str(self.t_end), str(self.q_len), str(self.q_start), str(self.q_end), self.mq)) + # comparing edges will compare their weights + + def __lt__(self, other): + return self.q_start < other.q_start + + def __gt__(self, other): + return self.q_start > other.q_start + +# class for outputting scaffolds in the end +class mySCAF: + + def __init__(self, names, seqs): + self.names = names + self.seqs = seqs + + + +################################### +###functions####################### +################################### + +# function to parse the config file containing the paths to +# blastn and minimap2 +def parsePATHs(): + scriptPath=sys.argv[0].rstrip("main.py") + configReader=open("%s/config.txt"%scriptPath) + minimap2Path="" + blastnPath="" + for line in configReader: + if not line.startswith("#"): + if line.startswith("MINIMAP2"): + minimap2Path = line.strip().split("=")[-1] + elif line.startswith("BLASTN="): + blastnPath = line.strip().split("=")[-1] + if minimap2Path=="": + print("Error no entry for minimap2 path in config.txt found - exiting...") + exit() + if blastnPath=="": + print("Error no entry for blastn path in config.txt found - exiting...") + exit() + configReader.close() + #print("minimap path: %s"%(minimap2Path)) + #print("blastn path: %s"%(blastnPath)) + + return minimap2Path, blastnPath + +#function to reverse complement a sequence +def 
reverseComplement(seq): + old = "ACGT" + replace = "TGCA" + return seq.upper().translate(str.maketrans(old, replace))[::-1] + +# function to run minimap2 +def runMinimap2(nThreads, mapMode, inFA, inFQ, outPAF): + scriptPath=sys.argv[0].rstrip("main.py") + try: + subprocess.call("%s/minimap2/minimap2 -t %i -x %s %s %s> %s" % + (scriptPath, nThreads, mapMode, inFA, inFQ, outPAF), shell=True, stdout=DEVNULL, stderr=subprocess.STDOUT) + #subprocess.call("%s/minimap2/minimap2 -t %i -x %s %s %s> %s" % + # (scriptPath, nThreads, mapMode, inFA, inFQ, outPAF), shell=True) + except: + print("Problem running minimap2 - check if its properly set up") + +# function to parse contigs and initialize the contigGraph +def parseContigs(path, cograph): + if path.endswith(".gz"): + import gzip + faParser = gzip.open(path, mode='rt') + else: + faParser = open(path) + # for first iteration/line + seq = "" + for line in faParser: + if line.startswith(">"): + if seq != "": + seqLen = len(seq) + # initalize contigGraph + cograph.addContigEdge(contID, seqLen) + # initalize contLenDICT + globalVars.contLenDICT[contID] = seqLen + globalVars.contDICT[contID] = seq + globalVars.insE[contID]=[] # insertions + globalVars.invE[contID]=[] # inversions + globalVars.delE[contID]=[] # deletions + globalVars.mjE[contID]=[] # misjoins + globalVars.goodE[contID]=[] # continuous mapping hits + if seqLen>=globalVars.largestContig[1]: + globalVars.largestContig=(contID, seqLen) + + contID = line.strip().split(" ")[0][1:] + seq = "" + else: + seq += line.strip() + + faParser.close() + + # for last sequence not covered by loop + seqLen = len(seq) + # initalize contigGraph + cograph.addContigEdge(contID, seqLen) + # initalize globalVars.contLenDICT + globalVars.contLenDICT[contID] = seqLen + globalVars.contDICT[contID] = seq + globalVars.insE[contID]=[] + globalVars.invE[contID]=[] + globalVars.delE[contID]=[] + globalVars.mjE[contID]=[] + globalVars.goodE[contID]=[] + + + +#function to bundle a set of edges between two nodes +#as described in gpma +def bundleEdges(edgeSet, noMed=False): + #print("BUNDELING:") + #print(edgeSet) + nm=False + bundledList=[] + if noMed ==True: # if median picked before clusered nothing take first edge + #print("Taking median failed before so now just pick first edge") + medianEdgeLen=edgeSet[0] + else: + medianEdgeLen = statistics.median(edgeSet) #calc median + #stDev=numpy.std(edgeSet) + #print("Median edge chosen: %i"%(medianEdgeLen)) + #print(stDev, globalVars.distError*2) + newWeight=0 + fitDists=[] + #if edgeSet only has length one to avaoid unnecessary computations + if len(edgeSet) == 1 and globalVars.minLinks<=1: + bundledList=[(1, edgeSet[0], 0)] + edgeSet.remove(edgeSet[0]) + + else: + for dist in edgeSet: + #print(abs(medianEdgeLen - dist), 2*globalVars.distError) + if abs(medianEdgeLen - dist) < 2*globalVars.distError: + newWeight+=1 + fitDists.append(dist) + #print("dist %i fit and was added and removed!"%dist) + # important to remove after loop otherwise it causes the error of skipping some edges + for dist in fitDists: + edgeSet.remove(dist) + if len(fitDists) > 1: + # bundle edges as defined in GPMA + p=sum([dist/((globalVars.distError*2)**2) for dist in fitDists]) + q=sum([1/((globalVars.distError*2)**2) for dist in fitDists]) + newDist=int(p/q) + newSDev=statistics.stdev(fitDists) + newEdge=(newWeight, newDist, newSDev) + #print("bundeled new edge added:") + #print(newEdge) + #print(fitDists) + bundledList.append(newEdge) + elif len(fitDists) == 0: + #print("Bundeling failed") + nm=True 
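+            # nothing clustered around this median: nm is handed to the
+            # recursive bundleEdges call at the end as noMed, which then
+            # falls back to taking the first remaining edge as the seed
+            # instead of the median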
+ #if globalVars.minLinks<=1: + # bundledList+=[(1, edge, 0) for edge in edgeSet] + # edgeSet=[] + + elif len(fitDists) == 1 and globalVars.minLinks<=1: + newDist=fitDists[0] + bundledList.append((1, fitDists[0], 0)) + + + #newEdge = (edgeWeight, medianEdgeLen) + + + # repeat if still unbundled edges left + if len(edgeSet)>0: + #print("recall bundeling there is still some edges left") + bundledList+=bundleEdges(edgeSet, nm) + #print("\n\n") + return(bundledList) + + +# parses PAF gets hitlists and checks them --> builds edges and bundles them +# returns set of bundled edges +def parsePAF(path): + if path.endswith(".gz"): + import gzip + pafParser = gzip.open(path) + else: + pafParser = open(path) + + # list for all hit of one reads --> e.g. best hit per split part + hitList = [] + posCounts = 0 + revCounts = 0 + + for line in pafParser: + splitted = line.strip().split("\t") + query = splitted[0] + q_len = int(splitted[1]) + q_start = int(splitted[2]) + q_end = int(splitted[3])-1 + strand = splitted[4] + tar = splitted[5] + t_len = int(splitted[6]) + t_start = int(splitted[7]) + t_end = int(splitted[8])-1 + #matching = splitted[9] + #opts = splitted[10] + mq = int(splitted[11]) + #P for primary aln or S for secondary + #alnType=splitted[12].split(":")[-1] + + # get fragment ID + qID = query.split("_")[0] + splitID = int(query.split("_")[-1]) + + curHIT = myHIT(qID, splitID, strand, tar, t_len, + t_start, t_end, q_len, q_start, q_end, mq) + + # take only the best hit for each fragment --> replace if the last one was + + if q_end-q_start>q_len*0.3: + if len(hitList)==0: + hitList.append([curHIT]) + continue + if curHIT.query == hitList[-1][0].query: + #if curHIT.split == lastHit.split: + + if curHIT.split == hitList[-1][0].split: + #print("Mult hit found") + #print(lastHit) + #print(curHIT) + #identical hit + hitList[-1].append(curHIT) + else: + hitList.append([curHIT]) + if curHIT.strand == "+": + posCounts += 1 + elif curHIT.strand == "-": + revCounts += 1 + + else: + # more than 1 hit --> only then it makes sense to check the + # hitlist + if len(hitList) > 1: + #check the hitlist to identify SVs misjoins and edges + checkHitlist.checkHitlist(hitList, globalVars.edgeDICT, oriHASH) + + # start hit list for new read and reset repCounter + hitList = [] + if q_end-q_start>=q_len*0.3: + hitList.append([curHIT]) + if curHIT.strand == "+": + posCounts += 1 + elif curHIT.strand == "-": + revCounts += 1 + + #for last hitlist + checkHitlist.checkHitlist(hitList, globalVars.edgeDICT, oriHASH) + + pafParser.close() + + + print("\t2.1) Bundling Edges") + edgeLIST = [] + # bundleEdges + for cont in globalVars.edgeDICT: + for tCont in globalVars.edgeDICT[cont]: + bundeledEdges=bundleEdges(globalVars.edgeDICT[cont][tCont]) + #print(bundeledEdges) + globalVars.edgeDICT[cont][tCont] = bundeledEdges + + # ignore edges with only weight 1 + acceptedEdges=[] + for edge in globalVars.edgeDICT[cont][tCont]: + + #print(edge) + if edge[0] >= globalVars.minLinks: + + #check if overlap so large it indicates a merge + if edge[1]*-1 >= globalVars.contLenDICT[cont[:-2]] or edge[1]*-1 >= globalVars.contLenDICT[tCont[:-2]]: + #print("overlap so large it indicates a merge") + if not edge[1]*-1 > 50000: + merged, realOL, mAdjust = merging.runBLAST(cont, tCont, edge[1], True) + if merged=="Removed": + print("Merging passed") + #removeMergedNEW(remove[0][:-2]+"_0", remove[1], remove[2]) + #removeMergedNEW(remove[0][:-2]+"_1", remove[3], remove[4]) + #else: + # print("Merging failed") + else: + + newEdge = 
globalVars.myEdge(cont, tCont, edge[0], edge[1], edge[2], 'r') + acceptedEdges.append(edge) + ins = False + for i, e in enumerate(edgeLIST): + if e < newEdge: + edgeLIST.insert(i, newEdge) + ins = True + break + + if ins == False: + edgeLIST.append(newEdge) + + #print("Accepted edges between %s and %s"%(cont, tCont)) + #for e in acceptedEdges: + # print(e) + #print("\n") + globalVars.edgeDICT[cont][tCont]=acceptedEdges + + return edgeLIST + + +# running GPMA from gpma.py +def runGPMA(orderedEdges, cGRAPH): + writer=open(globalVars.outDir + "/tmp_edges", "w") + for i, edge in enumerate(orderedEdges): + #print("Running GPMA trying to set edge (%i/%i):"%(i, len(orderedEdges)-1)) + #print(edge) + gpma(edge, cGRAPH) + writer.write(str(edge)+"\n") + writer.close() + +#function to cluster variants per contig +def clusterSVs(svList, cMap, writer, ms, misJoin=False, deletion=False): + #to avoid unnecessary checking + if len(svList) < 2: + return [] + + breaks=[] + import networkx as nx + G = nx.Graph() + nINS=len(svList) + #print(svList) + for i, sv1 in enumerate(svList): + for j in range(i+1, nINS): + sv2 = svList[j] + #print(sv1, sv2) + sd = abs((sv1[2]-sv1[1]) - (sv2[2]-sv2[1])) / max((sv1[2]-sv1[1]), (sv2[2]-sv2[1]), 1) + pd = min(abs(sv1[1]-sv2[1]), abs(sv1[2]-sv2[2]), abs((sv1[2]+sv1[1])/2 - (sv2[2]+sv2[1])/2)) + #print(sd, pd) + spd=sd+pd/2000 + #print(spd) + if spd < 0.7: + #print("CLUSTER TOGETHER") + G.add_edge(i, j) + #print("\n") + insClusts=nx.find_cliques(G) + #print("nClusters found:") + for iClust in insClusts: + #print(len(iClust)) + #print(iClust) + if len(iClust) > 2: + svB=[] + svE=[] + svW=0 + scaf="" + for i in iClust: + #print(svList[i]) + ins = svList[i] + scaf=ins[0] + svB.append(ins[1]) + svE.append(ins[2]) + svW+=1 + #print(ins) + + + #print(svB, svE) + + if misJoin==True: + svB=int(statistics.mean(svB)) + svE=int(statistics.mean(svE)+100) + if svE >= len(cMap): + if svB >= len(cMap): + continue + svE=len(cMap)-1 + #print(svB, svE) + meanGood=round(statistics.mean(cMap[svB:svE+1])) + meanSTD=round(statistics.stdev(cMap[svB:svE+1])) + meanGood+=(-3)*meanSTD + else: + svB=int(statistics.mean(svB)) + svE=int(statistics.mean(svE)) + + meanGood=round(statistics.mean(cMap[svB:svE+1])) + if deletion==True: + svW+=meanGood + #print(svB, svE, svW, meanGood, svW/(svW+meanGood)) + if svW > 2 and svW/(svW+meanGood)>=0.7: + #print("ACCEPT SV!!!") + #print(cMap[svB:svE+1]) + #print(svB, svE, svW, meanGood, svW/(svW+meanGood)) + #breaks.append((svB, svE)) + breaks.append(svB) + breaks.append(svE) + writer.write("%s\t%i\t%i\t%i\t%i\t%f\n"%(scaf ,svB, svE, svW, meanGood, svW/(svW+meanGood))) + cMap[svB]-=svW + if svB!=svE: + cMap[svE]-=svW + + + G.clear() + + return breaks + +# function to break contigs given a +def breakConts(bHASH, cit): + oWriter=open(args.outdir+"/"+"adjusted_contigs_it%i.fa"%(cit), "w") + for scaf in globalVars.contDICT: + if scaf in bHASH: + curStart=0 + counter=1 + breakEMargin=500 + for error in bHASH[scaf]: + #print(curStart, error, error-breakEMargin) + #only output fragments larger than 1000bp + if len(globalVars.contDICT[scaf][curStart:error-breakEMargin]) > 1000: + oWriter.write(">%s_%i\n"%(scaf, counter)) + #print("Write splitted scaf: %s from: %i-%i" %(scaf, curStart, error-breakEMargin)) + oWriter.write("%s\n"%(globalVars.contDICT[scaf][curStart:error-breakEMargin])) + + curStart=error+breakEMargin + counter+=1 + if len(globalVars.contDICT[scaf][curStart:]) > 1000: + oWriter.write(">%s_%i\n"%(scaf, counter)) + #print("Write splitted scaf: %s from: %i-%i" 
%(scaf, curStart, len(globalVars.contDICT[scaf]))) + oWriter.write("%s\n"%(globalVars.contDICT[scaf][curStart:])) + else: + oWriter.write(">%s\n"%(scaf)) + oWriter.write("%s\n"%(globalVars.contDICT[scaf])) + oWriter.close() + +# function that generates the pos. coverage map per contig +# and then runs clusterSVs on the different SVs for that contig +# and then checks if any contigs must be broken (breakConts) +def checkContigsNEW(curIt, noBreak): + svbreak=False + covWriter=open(globalVars.outDir + "/coverage_map_it%i.txt"%(curIt), "w") + insWriter=open(globalVars.outDir + "/insertions_it%i.txt"%(curIt), "w") + invWriter=open(globalVars.outDir + "/inversions_it%i.txt"%(curIt), "w") + delWriter=open(globalVars.outDir + "/deletions_it%i.txt"%(curIt), "w") + mjWriter=open(globalVars.outDir + "/misjoins_it%i.txt"%(curIt), "w") + svBreaks={} + for scaf in globalVars.goodE: + #print("Checking scaf %s"%(scaf)) + covMap=[0]*globalVars.contLenDICT[scaf] + breakpoints=[] + for g in globalVars.goodE[scaf]: + for i in range(g[1],g[2]+1): + covMap[i]+=1 + meanSupport=statistics.median(covMap[50:-50]) + #print("cont %s has a mean goodness of scaf %f"%(scaf, meanSupport)) + #print(covMap) + #print("INSERTIONS:") + breakpoints+=clusterSVs(globalVars.insE[scaf], covMap, insWriter, meanSupport) + #print("DELETIONS:") + breakpoints+=clusterSVs(globalVars.delE[scaf], covMap, delWriter, meanSupport, deletion=True) + #print("INVERSIONS:") + breakpoints+=clusterSVs(globalVars.invE[scaf], covMap, invWriter, meanSupport) + #print("MISJOINS:") + breakpoints+=clusterSVs(globalVars.mjE[scaf], covMap, mjWriter, meanSupport, misJoin=True) + + if breakpoints!=[]: + svbreak=True + svBreaks[scaf]=breakpoints + + checkCovMap(scaf, covMap, covWriter) + + + if svbreak==True and noBreak==False: + breakConts(svBreaks, curIt) + + covWriter.close() + insWriter.close() + delWriter.close() + invWriter.close() + mjWriter.close() + + return svbreak + +# outputs the coverage map +def checkCovMap(curCont, cmap, oWriter): + oWriter.write(">%s\n"%(curCont)) + totalBps=0 + nGood=0 + nBad=0 + nUnknown=0 + curRange=["", "", 0] + + for pos, cov in enumerate(cmap): + totalBps+=1 + if cov>0: + nGood+=1 + elif cov==0: + nUnknown+=1 + else: + nBad+=1 + if pos==0: + curRange[0]=pos + curRange[1]=pos + curRange[2]=cov + else: + if curRange[2]==cov: + curRange[1]=pos + else: + oWriter.write("\t%s\t%i\t%i\t%i\n"%(curCont, curRange[0], curRange[1], curRange[2])) + curRange[0]=pos + curRange[1]=pos + curRange[2]=cov + oWriter.write("\t%s\t%i\t%i\t%i\n"%(curCont, curRange[0], curRange[1], curRange[2])) + + +# function to write a scaffold for final output +def writeScaf(sSeqs, sNames, oNR, sWriter, gffWriter): + #Write output + newSeq = "" + #print("writing out") + + + for i, name in enumerate(sNames): + seq = sSeqs[i] + start=len(newSeq)+1 + newSeq += seq + end=len(newSeq) + if name.startswith("read_"): + gffWriter.write("scaf_%i\t.\tread(%s)\t%i\t%i\n"%(oNR, name, start, end)) + else: + gffWriter.write("scaf_%i\t.\tcontig(%s)\t%i\t%i\n"%(oNR, name, start, end)) + + + scafLen = len(newSeq) + sWriter.write(">scaf_%i(len=%i)\n" % (oNR, scafLen)) + # print(newName) + # print(newSeq) + myIndex = 0 + # fasta format capped at 80bps per line + while (myIndex + 80) <= len(newSeq): + sWriter.write("%s\n" % (newSeq[myIndex:myIndex + 80])) + myIndex += 80 + sWriter.write("%s\n" % (newSeq[myIndex:])) + + return scafLen + + + +# function to output all contigs/scaffolds in the end --> gets the path +# for each and then puts them together before writing out +def 
getPATH(myGraph): + scafWriter = open(globalVars.outDir + "/scaffolds.fasta", "w") + scafCount = 0 + idWriter = open(globalVars.outDir + "/conts_scafs.ids", "w") + gffWriter = open(globalVars.outDir + "/scaffolds.gff", "w") + # hash to keep track of already visited contigs + cLens=[] + visited = {} + count=1 + #print(len(myGraph.cGraph)) + for cont in myGraph.cGraph: + #print("Checking cont %s - %i/%i"%(cont, count, len(myGraph.cGraph))) + count+=1 + if not cont[:-2] in visited: + startNode = myGraph.findStart(copy.copy(cont)) + #print(myGraph.getPath(copy.copy(cont))) + first=True + #print("Getting path for cont %s start of path is at: %s"%(cont, startNode)) + + newScaf = mySCAF([], []) # intitalize output scaf + idWriter.write(">scaf_%i\n"%(scafCount)) + curNode=copy.copy(startNode) + while 1: + #print("while curNode=%s"%(curNode)) + # check if already visited + if not curNode[:-2] in visited: + visited[curNode[:-2]]=1 + else: + #print("CIRCLE FOUND") + curLen=writeScaf(newScaf.seqs, newScaf.names, scafCount, scafWriter, gffWriter) + cLens.append(curLen) + scafCount+=1 + idWriter.write("\n") + break + visited[curNode[:-2]] = 1 + #print("visited %s"%(curNode[:-2])) + + #case for startNode set dist to 0 + if first==True: + first=False + offset=0 + else: + offset = myGraph.cGraph[curNode].active.dist *(-1) # change pos/neg to get offset for contigSeq + if myGraph.cGraph[curNode].active != "": + idWriter.write("\tdist=%i, links=%i\n"%(offset*-1, myGraph.cGraph[curNode].active.weight)) + else: + idWriter.write("\tdist=%i\n"%(offset*-1)) + + + + # get scaf seq in correctOrientation + if curNode[-1]=="0": + contSeq = globalVars.contDICT[curNode[:-2]] + elif curNode[-1]=="1": # reverse complement seq + #contSeq=str(Seq(globalVars.contDICT[curNode[:-2]]).reverse_complement()) + contSeq=reverseComplement(globalVars.contDICT[curNode[:-2]]) + + + idWriter.write("\t%s(len=%i)\n"%(curNode, globalVars.contLenDICT[curNode[:-2]])) + + #fill gap + if offset < 0: # gap that must be filled exists + #print("Filling gap between %s and %s"%(myGraph.cGraph[curNode].active.target, curNode)) + fillID, fillSeq = fillEDGEgap(myGraph.cGraph[curNode].active.target, curNode, newScaf.seqs[len(newScaf.seqs)-1], contSeq, offset*-1) + #print("Filler = %s len=%i"%(fillID, len(fillSeq))) + + if fillID=="100N": + #print("Dont set edge") + del visited[curNode[:-2]] + myGraph.cGraph[curNode].active="" + curLen=writeScaf(newScaf.seqs, newScaf.names, scafCount, scafWriter, gffWriter) + cLens.append(curLen) + scafCount+=1 + idWriter.write("\n") + break + else: + newScaf.names.append("read_"+fillID) + newScaf.seqs.append(fillSeq) + + offset = 0 + + #print("appending cont %s with offset %i"%(curNode, offset)) + newScaf.names.append(curNode) + #print("appending seq of len=%i"%(len(contSeq[offset:]))) + newScaf.seqs.append(contSeq[offset:]) + + curNode = myGraph.cGraph[curNode].cEdge.target + + if myGraph.cGraph[curNode].active=="": + #print("End of path reached") + curLen=writeScaf(newScaf.seqs, newScaf.names, scafCount, scafWriter, gffWriter) + cLens.append(curLen) + scafCount+=1 + idWriter.write("\n") + break + else: + #print(myGraph.cGraph[curNode].active) + curNode = myGraph.cGraph[curNode].active.target + #print("Next in path is %s"%(curNode)) + #print(myGraph.cGraph[curNode].active) + + scafWriter.close() + idWriter.close() + gffWriter.close() + return cLens + + +# function to fill the gap between to contigs with a pacbio read +def fillEDGEgap(cont1, cont2, cSeq1, cSeq2, gLen): + gapFilled=False + #print("Trying to fill gap 
+
+# function to fill the gap between two contigs with a PacBio read
+def fillEDGEgap(cont1, cont2, cSeq1, cSeq2, gLen):
+    gapFilled=False
+    # sort contigs to get the same order as before for the edge connection
+    if cont1 < cont2:
+        node1 = cont1
+        node2 = cont2
+    else:
+        node1 = cont2
+        node2 = cont1
+    gapReads=globalVars.gapDICT[node1][node2]
+
+    # only align the up-to-1000bp contig ends that face the gap
+    if len(cSeq1)>1000:
+        adjustedSeq1=cSeq1[-1000:]
+        adjustedLen1=1000
+    else:
+        adjustedSeq1=cSeq1
+        adjustedLen1=len(cSeq1)
+    if len(cSeq2)>1000:
+        adjustedSeq2=cSeq2[:1000]
+        adjustedLen2=1000
+    else:
+        adjustedSeq2=cSeq2
+        adjustedLen2=len(cSeq2)
+
+    for read in gapReads:
+        rSeq=globalVars.readHash[read]
+        gapFilled=merging.fillGAPS(cont1, cont2, adjustedLen1, adjustedLen2, adjustedSeq1, adjustedSeq2, read, len(rSeq), rSeq)
+
+        # if the gap could be filled, abort the gap-filling process
+        if gapFilled!=False:
+            break
+
+    if gapFilled==False:
+        # gap could not be filled --> bridge with 100 Ns instead
+        read="100N"
+        gapFilled="N"*100
+
+    return read, gapFilled
+
+# calculates the N-statistics (N10,..,N50,..,N100)
+def calcNstats(scafLens, totalLen):
+    Nvals=[(0,0)]*10
+    n=0
+    curSum=0
+    curN=0.1
+    curPos=0
+    for slen in scafLens:
+        curSum+=slen
+        n+=1
+        # one scaffold can cross several N-thresholds at once,
+        # so advance the threshold in a loop
+        while curPos < len(Nvals) and curSum >= totalLen*curN:
+            Nvals[curPos]=(slen, n)
+            curN=round(curN+0.1, 2)
+            curPos+=1
+
+    return Nvals
+
+# function to generate and output the scaffold statistics
+def scafStats(sLens):
+    writer=open(globalVars.outDir + "/scaffolds.stats", "w")
+    sortedLens=sorted(sLens, reverse=True)
+    totalBps=sum(sLens)
+    totalScafs=len(sLens)
+    avgLen=statistics.mean(sLens)
+
+    writer.write("Total number of basepairs in assembly: %i\n"%(totalBps))
+    writer.write("Total number of scaffolds: %i\n"%(totalScafs))
+    writer.write("Longest scaffold: %i\n"%(sortedLens[0]))
+    writer.write("Average scaffold length: %i\n"%(avgLen))
+
+    nVs=calcNstats(sortedLens, totalBps)
+
+    names=["N10","N20","N30","N40","N50","N60","N70","N80","N90","N100"]
+    for i, nv in enumerate(nVs):
+        writer.write("%s = %i, n = %i\n"%(names[i], nv[0], nv[1]))
+
+    writer.close()
+
+# function to remove all tmp files created while running the script
+def removeTMPfiles():
+    subprocess.call("rm %s/tmp*" % (globalVars.outDir), shell=True)
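+# Worked example for calcNstats above (hypothetical lengths, in kb): for
+# sorted scaffold lengths 50, 30, 20 (total 100), the N50 entry is the
+# length at which the running sum first reaches 50% of the total, together
+# with the number of scaffolds needed to get there:
+#   >>> calcNstats([50, 30, 20], 100)[4]
+#   (50, 1)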
"/tmp_it%i.paf"%(curIter) + runMinimap2(args.threads, mapMode, genome, splitFile, myPAF) + t1 = datetime.datetime.now() + maptime=t1-t0 + + print("\n2) Parsing PAF") + t0=datetime.datetime.now() + edgeSet = parsePAF(myPAF) + t1=datetime.datetime.now() + parseTime=t1-t0 + + #avgCOV=statistics.median(globalVars.largestMapped) + #print(globalVars.largestMapped) + #print("\tCalculated average coverage: %i"%(avgCOV)) + + print("\n3) Clustering and identification of SVs and misjoins") + t0 = datetime.datetime.now() + restart=checkContigsNEW(curIter, nobreak) + t1 = datetime.datetime.now() + svTime=t1-t0 + #restart, contStats = checkContigs(curIter, nobreak) + if restart==True and nobreak==False: + print("\tContigs were adjusted restart needed!!!") + return False + else: + + # check if contigs need to be readjusted if so --> readjust and rerun + # mapping before running GPMA + print("\n4) Running GPMA") + print("\tNumber of edges: %i" %(len(edgeSet))) + t0 = datetime.datetime.now() + runGPMA(edgeSet, contigGraph) + t1 = datetime.datetime.now() + gpmaTime=t1-t0 + + # get the paths between contigs for output + print("\n5) Outputting scaffolds and filling gaps") + t0 = datetime.datetime.now() + scafLengths=getPATH(contigGraph) + t1 = datetime.datetime.now() + getPathTime=t1-t0 + + haploWriter = open(globalVars.outDir + "/duplicates.fasta", "w") + print("\n6) Outputting unscaffolded duplicates") + #output haplotigs + visited={} + for ele in globalVars.adjustDICT: + sID=ele[:-2] + if visited.get(sID, "nope")==1: + continue + + contSeq = globalVars.contDICT[sID] + haploWriter.write(">%s\n%s\n"%(sID, contSeq)) + + visited[sID]=1 + haploWriter.close() + + + #get stats + scafStats(scafLengths) + + + # cleaning up: + print("\n7) Cleaning up - removing tmp files") + removeTMPfiles() + + + print("\n\n") + + etime = datetime.datetime.now() + totaltime = etime - stime + #print(globalVars.contsMerged) + print("Total elapsed time: " + str(totaltime) + " [h:min:sec:millisec]") + print("0) Splitting time: " + str(splittime) + " [h:min:sec:millisec]") + print("1) Mapping time(minimap2): " + + str(maptime) + " [h:min:sec:millisec]") + print("2) Parsing time: " + str(parseTime) + " [h:min:sec:millisec]") + print("2) SV clustering time: " + str(svTime) + " [h:min:sec:millisec]") + print("4) GPMA time: " + str(gpmaTime) + " [h:min:sec:millisec]") + #print("\tTotal BLASTS done: %i"%(globalVars.blastCounter)) + print("5) GetPath time: " + str(getPathTime) + " [h:min:sec:millisec]") + + + return True + + + +########################################################################## +########################################################################## +########################################################################## +parser = argparse.ArgumentParser(description="") +parser.add_argument( + "-ge", "--genome", help="Contigs in fasta format", type=str, required=True) +parser.add_argument( + "-fq", "--fastq", help="Reads in fastq format", type=str, required=True) +parser.add_argument( + "-out", "--outdir", help="Directory for output", type=str, required=True) +parser.add_argument( + "-m", "--mode", help="Mapping mode in minimap2 for different types of input data data. 
+parser = argparse.ArgumentParser(description="")
+parser.add_argument(
+    "-ge", "--genome", help="Contigs in fasta format", type=str, required=True)
+parser.add_argument(
+    "-fq", "--fastq", help="Reads in fastq format", type=str, required=True)
+parser.add_argument(
+    "-out", "--outdir", help="Directory for output", type=str, required=True)
+parser.add_argument(
+    "-m", "--mode", help="Mapping mode in minimap2 for different types of input data. Options: pb/ont [default=pb]", type=str, default="pb")
+parser.add_argument(
+    "-t", "--threads", help="Threads to use for minimap2 only [default=4]", type=int, default=4)
+parser.add_argument(
+    "-ml", "--minlinks", help="Minimum reads supporting a link needed to set an edge between two contigs [default=2]", type=int, default=2)
+parser.add_argument(
+    "-mo", "--minoverlap", help="Minimum overlap needed for two contigs to be merged [default=50]", type=int, default=50)
+parser.add_argument(
+    "-mi", "--minident", help="Minimum identity in overlap needed for two contigs to be merged [default=85.0]", type=float, default=85.0)
+parser.add_argument(
+    "-it", "--iterations", help="Maximal iterations performed for breaking; set to 0 to perform NO breaking", type=int, default=2)
+parser.add_argument(
+    "-msvl", "--minSVLen", help="Minimum length of a structural variant (insertion, deletion, inversion) to break a contig for [default=1000]", type=int, default=1000, choices=range(500, 10001), metavar="[500, 10000]")
+parser.add_argument(
+    "-s", "--size", help="Size to split the input reads into (in bps) [default=1000]", type=int, default=1000)
+parser.add_argument(
+    "-fa", "--fasta", help="Enable if input reads are in fasta format", action="store_true")
+args = parser.parse_args()
+
+
+##########################################################################
+if __name__ == "__main__":
+    # starttime
+    stime = datetime.datetime.now()
+    # get the right mapping mode
+    if args.mode == "pb":
+        mapMode = 'map-pb'
+    elif args.mode == "ont":
+        mapMode = 'map-ont'
+    else:
+        parser.error("Unknown mode '%s', options are: pb/ont" % (args.mode))
+
+    genome=args.genome
+    curIt=0
+    noBreak=False
+    while curIt <= args.iterations:
+        print("---------------------------------------------ITERATION %i---------------------------------------------"%(curIt))
+        if curIt == args.iterations:
+            noBreak=True
+        end=run(genome, noBreak, curIt)
+        genome=args.outdir+"/"+"adjusted_contigs_it%i.fa"%(curIt)
+        curIt+=1
+        if end == True:
+            curIt=args.iterations+1
+    print("\n\n")
\ No newline at end of file
diff --git a/merging.py b/merging.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec236e706e8ba77924c317745bcd5863be570cac
--- /dev/null
+++ b/merging.py
@@ -0,0 +1,472 @@
+import globalVars
+import gpma
+import subprocess, datetime
+try:
+    from subprocess import DEVNULL  # py3k
+except ImportError:
+    import os  # for python2
+    DEVNULL = open(os.devnull, 'wb')
+
+
+# function to reverse complement a sequence
+def reverseComplement(seq):
+    old = "ACGT"
+    replace = "TGCA"
+    return seq.upper().translate(str.maketrans(old, replace))[::-1]
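+# doctest-style example for reverseComplement above (illustrative):
+# complement each base, then reverse the result
+#   >>> reverseComplement("ACCGT")
+#   'ACGGT'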
name - %s"%(id1)) + else: + seq1_trimmed=seq1 + checkOL=False + + if len(seq2) > estOL*(-1)+500: + if id2.endswith("_1"): + seq2_trimmed=seq2[int(len(seq2)+(estOL-500)):] + elif id2.endswith("_0"): + seq2_trimmed=seq2[:int(estOL*(-1)+500)] + else: + print("WTF is this name - %s"%(id2)) + else: + seq2_trimmed=seq2 + checkOL=False + else: + seq1_trimmed=seq1 + seq2_trimmed=seq2 + ''' + + tmpDIR = globalVars.outDir + + #print(len(seq1), len(seq1_trimmed)) + #print(len(seq2), len(seq2_trimmed)) + oWriter = open(tmpDIR + "/tmp1.fa", "w") + #oWriter.write(">%s\n%s\n" % (id1, seq1)) + oWriter.write(">%s\n%s\n" % (id1, seq1_trimmed)) + oWriter.close() + oWriter = open(tmpDIR + "/tmp2.fa", "w") + #oWriter.write(">%s\n%s\n" % (id2, seq2)) + oWriter.write(">%s\n%s\n" % (id2, seq2_trimmed)) + oWriter.close() + + # define blastn output columns + outfmt = "'6 qseqid sseqid pident length qstart qend sstart send sstrand evalue bitscore'" + stime = datetime.datetime.now() + #nod threads --> 'num_threads' is currently ignored when 'subject' is specified. + subprocess.call("blastn -word_size 15 -outfmt %s -subject %s -query %s -out %s" % (outfmt, tmpDIR + "/tmp1.fa", tmpDIR + "/tmp2.fa", tmpDIR + "/tmp.blast6"), + shell=True, stdout=DEVNULL, stderr=subprocess.STDOUT) + etime = datetime.datetime.now() + #print("Blast Done time: " + str(etime-stime) + " [h:min:sec:millisec]") + #increase BLAST Counter + globalVars.blastCounter+=1 + + + # length need to be this way around --> 2 first then 1 beacuse 1 was used + # as reference + sucess, rOL, mAdjust = tryMERGE(tmpDIR + "/tmp.blast6", id1, id2, len(seq1_trimmed), len(seq2_trimmed)) + + #if (rOL*(-1)==len(seq1_trimmed)-10 or rOL*(-1)==len(seq2_trimmed)-10) and checkOL==True: + # print("Full overlap rerun with more sequence!!!") + # return runBLAST(id1, id2, eOL, False) + + return sucess, rOL, mAdjust + + +# function for merging contigs by reading the blastn output +def tryMERGE(blastPATH, cID1, cID2, len1, len2): + blastReader = open(blastPATH) + + mergeAdjust="" + #just a initialization beacuse it needs a return value + alnLen=1 + + alllines = blastReader.readlines() + if len(alllines) == 0: + #print("\tERROR no hit found - NO MERGING CAN BE DONE!!!") + merged = False + else: + #print("Number of lines in blast comp=%i" %(len(alllines))) + for line in alllines: + #print(line) + splitted = line.strip().split("\t") + # s2 first beacause s1 was used as reference + s2 = splitted[0] + s1 = splitted[1] + ident = float(splitted[2]) + alnLen = int(splitted[3]) + # IMPORTANT BLAST INDICES START WITH 1 NOT WITH 0 --> -1 for all of + s2_start = int(splitted[4]) - 1 + s2_end = int(splitted[5]) - 1 + s1_start = int(splitted[6]) - 1 + s1_end = int(splitted[7]) - 1 + s1_strand = splitted[8] + + #print("\tc1:%s len=%i ostart=%i oend=%i" % + # (cID1, len1, s1_start, s1_end)) + #print("\tc2:%s len=%i ostart=%i oend=%i" % + # (cID2, len2, s2_start, s2_end)) + + # to merge as input argument + if alnLen >= globalVars.minOverlap and ident >= globalVars.minIdent: + + #first check if alnLen is equal to len of one contig which means its fully part of the other and can be removed + if alnLen >= len1-10: + #print("Contig %s is fully part of contig %s and will be removed!!!"%(cID1, cID2)) + if s1_start < s1_end: + if s2_start < s2_end: + nTar0=cID2[:-2]+"_0" + aDist0=s2_start + nTar1=cID2[:-2]+"_1" + aDist1=len2-s2_end + elif s2_start > s2_end: + nTar0=cID2[:-2]+"_1" + aDist0=len2-s2_end + nTar1=cID2[:-2]+"_0" + aDist1=s2_start + elif s1_start > s1_end: + if s2_start < s2_end: + 
+
+# function for merging contigs by reading the blastn output
+def tryMERGE(blastPATH, cID1, cID2, len1, len2):
+    blastReader = open(blastPATH)
+
+    mergeAdjust=""
+    # just an initialization because it needs a return value
+    alnLen=1
+
+    alllines = blastReader.readlines()
+    if len(alllines) == 0:
+        # no hit found - no merging can be done
+        merged = False
+    else:
+        for line in alllines:
+            splitted = line.strip().split("\t")
+            # s2 first because s1 was used as the reference
+            s2 = splitted[0]
+            s1 = splitted[1]
+            ident = float(splitted[2])
+            alnLen = int(splitted[3])
+            # IMPORTANT: BLAST indices start with 1, not with 0 --> -1 for all of them
+            s2_start = int(splitted[4]) - 1
+            s2_end = int(splitted[5]) - 1
+            s1_start = int(splitted[6]) - 1
+            s1_end = int(splitted[7]) - 1
+            s1_strand = splitted[8]
+
+            if alnLen >= globalVars.minOverlap and ident >= globalVars.minIdent:
+
+                # first check if alnLen is (almost) equal to the length of one
+                # contig, which means it is fully part of the other and can be removed
+                if alnLen >= len1-10:
+                    # contig cID1 is fully part of contig cID2 and will be removed
+                    if s1_start < s1_end:
+                        if s2_start < s2_end:
+                            nTar0=cID2[:-2]+"_0"
+                            aDist0=s2_start
+                            nTar1=cID2[:-2]+"_1"
+                            aDist1=len2-s2_end
+                        elif s2_start > s2_end:
+                            nTar0=cID2[:-2]+"_1"
+                            aDist0=len2-s2_end
+                            nTar1=cID2[:-2]+"_0"
+                            aDist1=s2_start
+                    elif s1_start > s1_end:
+                        if s2_start < s2_end:
+                            nTar0=cID2[:-2]+"_1"
+                            aDist0=len2-s2_end
+                            nTar1=cID2[:-2]+"_0"
+                            aDist1=s2_start
+                        elif s2_start > s2_end:
+                            nTar0=cID2[:-2]+"_0"
+                            aDist0=s2_start
+                            nTar1=cID2[:-2]+"_1"
+                            aDist1=len2-s2_end
+
+                    mergeAdjust=(cID1, nTar0, aDist0, nTar1, aDist1)
+                    merged="Removed"
+                    break
+
+                elif alnLen >= len2-10:
+                    # contig cID2 is fully part of contig cID1 and will be removed
+                    if s1_start < s1_end:
+                        if s2_start < s2_end:
+                            nTar0=cID1[:-2]+"_0"
+                            aDist0=s1_start
+                            nTar1=cID1[:-2]+"_1"
+                            aDist1=len1-s1_end
+                        elif s2_start > s2_end:
+                            nTar0=cID1[:-2]+"_1"
+                            aDist0=len1-s1_end
+                            nTar1=cID1[:-2]+"_0"
+                            aDist1=s1_start
+                    elif s1_start > s1_end:
+                        if s2_start < s2_end:
+                            nTar0=cID1[:-2]+"_1"
+                            aDist0=len1-s1_end
+                            nTar1=cID1[:-2]+"_0"
+                            aDist1=s1_start
+                        elif s2_start > s2_end:
+                            nTar0=cID1[:-2]+"_0"
+                            aDist0=s1_start
+                            nTar1=cID1[:-2]+"_1"
+                            aDist1=len1-s1_end
+
+                    mergeAdjust=(cID2, nTar0, aDist0, nTar1, aDist1)
+                    merged="Removed"
+                    break
+
+                # get orientations of the contigs, meaning which ends are overlapping
+                ori1 = cID1[-1]
+                ori2 = cID2[-1]
+
+                # case1 c1_1 --> c2_0
+                if ori1 == "1" and ori2 == "0":
+                    # example: c1:ctg7180000370328_1 len=1919 ostart=1418 oend=1918
+                    # example: c2:ctg7180000371159_0 len=5383 ostart=0 oend=500
+                    # make sure the alns are at the borders
+                    if s1_end + 10 >= len1 - 1 and s2_start - 10 <= 0:
+                        newSeqID = cID1 + "__" + cID2
+                        saveMerge(newSeqID, cID1, cID2, alnLen)
+                        merged = True
+                    else:
+                        # alns not at the borders
+                        merged = False
+
+                # case2 c1_1 --> c2_1
+                elif ori1 == "1" and ori2 == "1":
+                    # example: c1:ctg7180000370630_1 len=5524 ostart=5523 oend=5458
+                    # example: c2:jtg7180000371240f_7180000371241f_1 len=12170 ostart=12104 oend=12169
+                    if s1_start + 10 >= len1 - 1 and s2_end + 10 >= len2 - 1:
+                        newSeqID = cID1 + "__" + cID2
+                        saveMerge(newSeqID, cID1, cID2, alnLen)
+                        merged = True
+                    else:
+                        merged = False
+
+                # case3 c1_0 --> c2_1
+                elif ori1 == "0" and ori2 == "1":
+                    # example: c1:ctg7180000370268_0 len=3867 ostart=0 oend=419
+                    # example: c2:ctg7180000370715_1 len=8150 ostart=7730 oend=8149
+                    if s2_end + 10 >= len2 - 1 and s1_start - 10 <= 0:
+                        newSeqID = cID2 + "__" + cID1
+                        saveMerge(newSeqID, cID1, cID2, alnLen)
+                        merged = True
+                    else:
+                        merged = False
+
+                # case4 c1_0 --> c2_0
+                elif ori1 == "0" and ori2 == "0":
+                    # example: c1:ctg7180000370227_0 len=1568 ostart=415 oend=0
+                    # example: c2:ctg7180000371034_0 len=1684 ostart=0 oend=415
+                    if s1_end - 10 <= 0 and s2_start - 10 <= 0:
+                        newSeqID = cID2 + "__" + cID1
+                        saveMerge(newSeqID, cID1, cID2, alnLen)
+                        merged = True
+                    else:
+                        merged = False
+
+                if merged == True:
+                    # if merging was successful there is no reason to check more
+                    # lines; break out of the loop and ignore the remaining alns
+                    break
+            else:
+                # merging failed: either too little overlap or identity too low
+                merged = False
+
+    blastReader.close()
+    return merged, alnLen*-1, mergeAdjust
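+# Sketch (hypothetical helper, not used above): the four orientation cases
+# in tryMERGE all reduce to "does the alignment reach a contig border,
+# within a 10bp tolerance", e.g. for the right border:
+#   >>> def atRightBorder(alnEnd, contLen, tol=10):
+#   ...     return alnEnd + tol >= contLen - 1
+#   >>> atRightBorder(1918, 1919)
+#   True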
+
+
+def saveMerge(nID, oID1, oID2, olap):
+    # store merges under a canonical (sorted) key so the pair is found
+    # regardless of the order the two contigs come in
+    if oID1<=oID2:
+        combi=oID1+"__"+oID2
+    else:
+        combi=oID2+"__"+oID1
+
+    globalVars.mergedDICT[combi]=(nID, olap)
+
+
+# function to properly remove a merged contig from the graph dictionary
+def removeMerged(rdict, rkey, newTar, addDist):
+    try:
+        print("Removing %s from dict"%(rkey))
+        # part to adjust an existing edge
+        existingEdge=rdict[rkey].active
+        globalVars.adjustDICT[rkey]=(newTar, addDist)
+        if existingEdge!="":
+            existingTarget=existingEdge.target
+            rdict[existingTarget].active=""
+            # update the edge with the new target and the added distance
+            adjustedEdge=existingEdge
+            adjustedEdge.target=newTar
+            adjustedEdge.dist-=addDist  # must be subtracted to properly adjust
+        # remove the contig from the hash
+        del rdict[rkey]
+
+    except KeyError:
+        print("KeyError while removing %s"%(rkey))
+        pass
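+# doctest-style example for the canonical key in saveMerge above
+# (illustrative node names): the key is independent of argument order
+#   >>> "__".join(sorted(["ctg2_0", "ctg1_1"]))
+#   'ctg1_1__ctg2_0'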
+ subprocess.call("blastn -word_size 15 -outfmt %s -subject %s -query %s -out %s" % (outfmt, tmpDIR + "/tmp_cont1.fa", tmpDIR + "/tmp_read.fa", tmpDIR + "/tmp1.blast6"), + shell=True, stdout=DEVNULL, stderr=subprocess.STDOUT) + subprocess.call("blastn -word_size 15 -outfmt %s -subject %s -query %s -out %s" % (outfmt, tmpDIR + "/tmp_cont2.fa", tmpDIR + "/tmp_read.fa", tmpDIR + "/tmp2.blast6"), + shell=True, stdout=DEVNULL, stderr=subprocess.STDOUT) + + ori1=cID1[-1] + ori2=cID2[-1] + + + #print("TRYING TO MERGE C1:%s AND READ:%s..."%(cID1, rID)) + blastReader = open(tmpDIR + "/tmp1.blast6") + alllines = blastReader.readlines() + if len(alllines) == 0: + #print("\tERROR no hit found - NO C1 MERGING CAN BE DONE!!!!!!!") + merge1 = False + else: + #print("Number of lines in blast comp=%i" %(len(alllines))) + for line in alllines: + merge1=False + #print(line) + splitted = line.strip().split("\t") + rID = splitted[0] + cID = splitted[1] + ident = float(splitted[2]) + alnLen = int(splitted[3]) + # IMPORTANT BLAST INDICES START WITH 1 NOT WITH 0 --> -1 for all of + r_start = int(splitted[4]) - 1 + r_end = int(splitted[5]) - 1 + c_start = int(splitted[6]) - 1 + c_end = int(splitted[7]) - 1 + c_strand1 = splitted[8] + + #print("\tcontig:%s len=%i ostart=%i oend=%i cstrand=%s" % + # (cID1, cLen1, c_start, c_end, c_strand1)) + #print("\tread:%s len=%i ostart=%i oend=%i" % + # (rID, rLen, r_start, r_end)) + + if alnLen >= 0.3*globalVars.splitLen and ident >= 80: + end=max(c_start, c_end) + if end + 20 >= cLen1: + merge1=True + #print("\tGOOD C1 MERGING POSSIBLE FROM START!!!!!!!") + #get read starting position for concatenation + if c_strand1=="minus": + readStart=r_start + else: + readStart=r_end + #print(line) + break + + #else: + # print("\tERROR C1 MERGING FAILED ALN NOT AT BORDERS OF CONT!!!!!!!") + #else: + # print("\tC1 MERGING FAILED EITHER TOO LITTLE OVERLAP(%i) OR IDENTITY(%f) TOO LOW" % ( + # alnLen, ident)) + + # no need to test to merge C2 with read if C1 merge failed + if merge1==False: + #print("GAP FILLING FAILED!!!!!") + return False + + + blastReader.close() + + #print("TRYING TO MERGE C2:%s AND READ:%s..."%(cID2, rID)) + blastReader = open(tmpDIR + "/tmp2.blast6") + alllines = blastReader.readlines() + if len(alllines) == 0: + #print("\tERROR no hit found - NO C2 MERGING CAN BE DONE!!!!!!!") + merge2 = False + else: + #print("Number of lines in blast comp=%i" %(len(alllines))) + for line in alllines: + merge2=False + #print(line) + splitted = line.strip().split("\t") + # s2 first beacause s1 was used as reference + rID = splitted[0] + cID = splitted[1] + ident = float(splitted[2]) + alnLen = int(splitted[3]) + # IMPORTANT BLAST INDICES START WITH 1 NOT WITH 0 --> -1 for all of + r_start = int(splitted[4]) - 1 + r_end = int(splitted[5]) - 1 + c_start = int(splitted[6]) - 1 + c_end = int(splitted[7]) - 1 + c_strand2 = splitted[8] + + #print("\tcontig:%s len=%i ostart=%i oend=%i cstrand=%s" % + # (cID2, cLen2, c_start, c_end, c_strand2)) + #print("\tread:%s len=%i ostart=%i oend=%i" % + # (rID, rLen, r_start, r_end)) + + if alnLen >= 0.3*globalVars.splitLen and ident >= 80: + start=min(c_start, c_end) + if start-20 <= 0: + merge2=True + #print("\tGOOD C2 MERGING POSSIBLE FROM START!!!!!!!") + #get read starting position for concatenation + if c_strand2=="minus": + readEnd=r_end + else: + readEnd=r_start + #print(line) + break + #else: + # print("\tERROR C2 MERGING FAILED ALN NOT AT BORDERS OF CONT!!!!!!!") + #else: + # print("\tC2 MERGING FAILED EITHER TOO LITTLE OVERLAP(%i) OR 
+
+    blastReader.close()
+
+    if merge1==True and merge2==True:
+        if c_strand1 != c_strand2:
+            # gap filling failed: the two hits were in different directions
+            return False
+        # filling the gap is possible; return the read slice between the hits
+        if readStart<=readEnd:
+            return rSeq[readStart:readEnd]
+        else:
+            return reverseComplement(rSeq[readEnd:readStart])
+    else:
+        return False
+
diff --git a/minimap2 b/minimap2
new file mode 160000
index 0000000000000000000000000000000000000000..d90583b83cd81abd652abb025f39b8884937d59c
--- /dev/null
+++ b/minimap2
@@ -0,0 +1 @@
+Subproject commit d90583b83cd81abd652abb025f39b8884937d59c
diff --git a/splitReads.py b/splitReads.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe57fa2694e49605a0406ef791c4cae459391125
--- /dev/null
+++ b/splitReads.py
@@ -0,0 +1,131 @@
+import globalVars
+
+class myFQ:
+
+    def __init__(self, iD, seq, opt, qual):
+        self.iD = iD
+        self.seq = seq
+        self.opt = opt
+        self.qual = qual
+
+
+class myFA:
+
+    def __init__(self, iD, seq):
+        self.iD = iD
+        self.seq = seq
+
+def splitReadsFQ(inFQ, oFQ, splitSize):
+
+    if inFQ.endswith(".gz"):
+        import gzip
+        fqParser = gzip.open(inFQ, mode='rt')
+    else:
+        fqParser = open(inFQ)
+
+    myWriter = open(oFQ, "w")
+
+    lineCount = 0
+    seqCount = 0
+    writtenREAD_ID = 1
+    for line in fqParser:
+        lineCount += 1
+
+        if lineCount == 1:
+            rid=line[1:].rstrip().split("\t")[0].split(" ")[0]
+            curRead = myFQ(line.strip(), "", "", "")
+        elif lineCount == 2:
+            curRead.seq = line.strip()
+        elif lineCount == 3:
+            curRead.opt = line.strip()
+        elif lineCount == 4:
+            curRead.qual = line.strip()
+
+            curStart = 0
+            curEnd = splitSize
+            splitCount = 0
+
+            while curEnd < len(curRead.seq):
+                splitID=str(writtenREAD_ID) + "_" + str(splitCount)
+                #splitID=str(rid) + "_" + str(splitCount)
+                myWriter.write("@" + splitID + "\n")
+                myWriter.write(str(curRead.seq[curStart:curEnd]) + "\n")
+                myWriter.write(str(curRead.opt) + "\n")
+                myWriter.write(str(curRead.qual[curStart:curEnd]) + "\n")
+
+                splitCount += 1
+                curStart += splitSize
+                curEnd += splitSize
+
+            # write the trailing chunk (< splitSize)
+            splitID=str(writtenREAD_ID) + "_" + str(splitCount)
+            #splitID=str(rid) + "_" + str(splitCount)
+            myWriter.write("@" + splitID + "\n")
+            myWriter.write(str(curRead.seq[curStart:]) + "\n")
+            myWriter.write(str(curRead.opt) + "\n")
+            myWriter.write(str(curRead.qual[curStart:]) + "\n")
+
+            # keep the full-length sequence for gap filling later on
+            globalVars.readHash[str(writtenREAD_ID)]=curRead.seq
+            #globalVars.readHash[str(rid)]=curRead.seq
+
+            writtenREAD_ID += 1
+            seqCount += 1
+            lineCount = 0
+
+    myWriter.close()
+    fqParser.close()
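+# Worked example for splitReadsFQ above (hypothetical numbers): a 2350bp
+# read with splitSize=1000 is written as splits 1_0 and 1_1 of 1000bp plus
+# a trailing split 1_2 of 350bp, and its full-length sequence stays
+# available in globalVars.readHash under the key "1".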
+
+
+def splitReadsFA(inFA, oFA, splitSize):
+
+    if inFA.endswith(".gz"):
+        import gzip
+        faParser = gzip.open(inFA, mode='rt')
+    else:
+        faParser = open(inFA)
+
+    myWriter = open(oFA, "w")
+
+    # write one record as wrapped splits and keep the full sequence;
+    # also emits the trailing chunk (< splitSize), mirroring the fastq splitter
+    def writeRecord(curRead, readID):
+        curStart = 0
+        curEnd = splitSize
+        splitCount = 0
+        while curStart < len(curRead.seq):
+            splitID = str(readID) + "_" + str(splitCount)
+            curSeq = str(curRead.seq[curStart:curEnd])
+            myWriter.write(">" + splitID + "\n")
+            myIndex = 0
+            # fasta format capped at 80bps per line
+            while (myIndex + 80) <= len(curSeq):
+                myWriter.write("%s\n" % (curSeq[myIndex:myIndex + 80]))
+                myIndex += 80
+            myWriter.write("%s\n" % (curSeq[myIndex:]))
+            splitCount += 1
+            curStart += splitSize
+            curEnd += splitSize
+        globalVars.readHash[str(readID)] = curRead.seq
+
+    curID = ""
+    seqCount = 0
+    writtenREAD_ID = 1
+    for line in faParser:
+        if line.startswith(">"):
+            if curID != "":
+                writeRecord(curRead, writtenREAD_ID)
+                writtenREAD_ID += 1
+                seqCount += 1
+            curID = line.strip()
+            curRead = myFA(curID, "")
+        else:
+            curRead.seq += line.strip()
+
+    # flush the final record, which the header-triggered branch never reaches
+    if curID != "":
+        writeRecord(curRead, writtenREAD_ID)
+
+    myWriter.close()
+    faParser.close()
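+# Minimal usage sketch (illustrative; the file paths and the elided
+# globalVars.init arguments are assumptions, see the main script):
+#   import globalVars, splitReads
+#   globalVars.init(...)  # sets up globalVars.readHash among others
+#   splitReads.splitReadsFQ("reads.fastq", "out/tmp_1000bp_splits", 1000)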