From c19d91fa26e9b37d87eae63caf1437dc0c64f064 Mon Sep 17 00:00:00 2001
From: max <you@example.com>
Date: Sun, 27 Oct 2019 14:51:20 +0100
Subject: [PATCH] init commit

---
 .gitmodules     |   4 +
 README.md       |  71 ++++
 checkHitlist.py | 384 +++++++++++++++++++
 globalVars.py   | 249 +++++++++++++
 gpma.py         | 456 ++++++++++++++++++++++
 main.py         | 975 ++++++++++++++++++++++++++++++++++++++++++++++++
 merging.py      | 472 +++++++++++++++++++++++
 minimap2        |   1 +
 splitReads.py   | 131 +++++++
 9 files changed, 2743 insertions(+)
 create mode 100644 .gitmodules
 create mode 100644 checkHitlist.py
 create mode 100644 globalVars.py
 create mode 100644 gpma.py
 create mode 100755 main.py
 create mode 100644 merging.py
 create mode 160000 minimap2
 create mode 100644 splitReads.py

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..429e53a
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,4 @@
+[submodule "minimap2"]
+	path = minimap2
+	url = https://github.com/lh3/minimap2
+	branch = master
diff --git a/README.md b/README.md
index 9463379..ff1375c 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,73 @@
 # masterTool
+## A tool to scaffold and quality-control genome assemblies
+<p align="center">
+<img src="graphics/simple_workflow.svg">
+</p>
+
+### Dependencies/Prerequisites:
+1. Python3 (3.6.6)
+→ the script is executable (uses the machine's default python3)
+→ should be backward compatible down to Python 2.6
+2. Biopython needs to be set up for the Python version used
+→ pip/pip3 install biopython
+3. NCBI-BLAST+ package (blastn) needs to be set up GLOBALLY
+4. minimap2 set up within the script folder
+→ the bundled build targets Ubuntu 18.04 and may need to be recompiled on other machines ("make")
+
+### Setup:
+Clone the repository including the minimap2 submodule:
+git clone --recursive https://github.com/mdriller/masterTool.git
+Move into the minimap2 directory and build it:
+cd masterTool/scripts/minimap2/
+make
+
+### Usage:
+If python3 is in your PATH, the script can be called directly ("./main.py"); otherwise use "python main.py".
+The help can be accessed via: ./main.py -h|--help
+To run, the script needs a genome assembly in fasta format and long reads (e.g. PacBio or ONT) in fastq or fasta format.
+>usage: main.py [-h] -ge GENOME -fq FASTQ -out OUTDIR [-m MODE] [-t THREADS]
+               [-mo MINOVERLAP] [-mi MINIDENT] [-it ITERATIONS] [-s SIZE]
+               [-fa]
+
+>>optional arguments:
+  -h, --help            show this help message and exit
+  -ge GENOME, --genome GENOME
+                        Contigs in fasta format
+  -fq FASTQ, --fastq FASTQ
+                        Reads in fastq format
+  -out OUTDIR, --outdir OUTDIR
+                        Directory for output files
+  -m MODE, --mode MODE  Mode for different types of input data. Options:
+                        pb/ont [default=pb]
+  -t THREADS, --threads THREADS
+                        Threads to use (during mapping, for now) [default=4]
+  -mo MINOVERLAP, --minoverlap MINOVERLAP
+                        Minimum overlap needed for two contigs to be merged
+                        [default=100]
+  -mi MINIDENT, --minident MINIDENT
+                        Minimum identity in overlap needed for two contigs to
+                        be merged [default=90.0]
+  -it ITERATIONS, --iterations ITERATIONS
+                        Maximal number of breaking iterations; set to 0 to
+                        perform NO breaking
+  -s SIZE, --size SIZE  Size of the chunks the input reads are split into
+                        [default=500]
+  -fa, --fasta          Enable if input reads are in fasta format
+
+Example Usage:
+
+./main.py -s 500 -ge /home/max/tests/genome.fasta -fq /home/max/tests/pacbioReads.fastq -out /home/max/tests/output -it 3
+
+### Output files:
+
+1. scaffolds.fasta → final scaffolded assembly
+2. haplotigs.fasta → removed "haplotigs", i.e. (partially) duplicated contigs
+3. adjusted_contigs.fa → set of adjusted contigs (if broken in the first iteration)
+4. conts_scafs.ids → maps contig IDs back to the new scaffold IDs
+5. scaffolds_QC_it<i>.gff → gff file with the GOODNESS track of each scaffold; i is the iteration whose assembly was used as the mapping reference
+
+### Workflow
+<p align="center">
+<img src="graphics/workflow.svg">
+</p>
diff --git a/checkHitlist.py b/checkHitlist.py
new file mode 100644
index 0000000..25221ee
--- /dev/null
+++ b/checkHitlist.py
@@ -0,0 +1,384 @@
+import globalVars
+
+# function to compare two fragments on the same scaffold to check for
+# insertions and deletions
+def checkINSDELS(hit1, hit2):
+    curError=None
+    curCorrect=None
+    #print("check INS/DELS")
+    hdist = ((hit2.split - hit1.split) - 1) * hit1.q_len
+
+    if hit1.strand == "+" and hit2.strand == "+":
+        h1_end = min(hit1.t_end + (hit1.q_len - hit1.q_end), hit1.t_len-1)
+        h2_start = max(hit2.t_start - hit2.q_start, 0)
+        dist = h2_start - h1_end
+        diffDIST = dist - hdist
+        eStart=hit1.t_end
+        eEnd=hit2.t_start
+        cstart=hit1.t_start
+        cend=hit2.t_end
+
+    elif hit1.strand == "-" and hit2.strand == "-":
+        h1_end = min(hit1.t_start - (hit1.q_len - hit1.q_end), hit1.t_len-1)
+        h2_start = max(hit2.t_end + hit2.q_start, 0)
+        dist = h1_end - h2_start
+        diffDIST = dist - hdist
+        eEnd=hit1.t_start
+        eStart=hit2.t_end-1
+        cstart=hit1.t_end
+        cend=hit2.t_start
+
+    if diffDIST >= globalVars.minSVL or diffDIST <= -1*globalVars.minSVL:
+        #print("Error detected")
+        #print(hit1)
+        #print(hit2)
+        #print(eStart, eEnd, diffDIST)
+        curError=(hit1.target, min(eStart, eEnd), max(eStart, eEnd), diffDIST)
+        #if diffDIST < 0:
+        #    print("DELETION!!!")
+    else:
+        #print("Correct mapping")
+        curCorrect=(min(cstart, cend), max(cstart, cend))
+        #print(curCorrect)
+    # print(hit1)
+    # print(hit2)
+    # print(curCorrect)
+    # print(curError)
+    return curError, curCorrect
+
+
+# identify inversions
+def checkINVS(hit1, hit2, oriHASH):
+    #print("check INVS")
+    # to store the hits/fragments of inversion --> start and end
+    #hdist = ((hit2.split - hit1.split) - 1) * hit1.q_len
+    #readOri = oriHASH[hit1.query]
+
+    curError=None
+
+    #if hit2.strand != oriHASH[hit2.query]:
+    invStart=min(hit1.t_start, hit2.t_start)
+    invEnd=max(hit1.t_start, hit2.t_start)
+    curError=(hit1.target, invStart, invEnd, invEnd-invStart)
+    return curError
+
+
+def calcHitContig_Dist_rev(hit1, hit2):
+    err1=False
+    err2=False
+    h1_end = hit1.t_start - (hit1.q_len - hit1.q_end) # rest of contig 1 after extending aln to theoretical end, can be negative
+    h2_start = hit2.t_len - (hit2.t_end + hit2.q_start) # rest of contig 2 after extending aln to theoretical end, can be negative
+    # hdist -> distance between splits
+    if hit1.split != hit2.split:
+        hdist = ((hit2.split - hit1.split) - 1) * hit1.q_len
+    else:
+        hdist = 0
+    # calc contig distance
+    cdist = hdist - (h1_end + h2_start)
+    if h1_end > hdist and h1_end > globalVars.splitLen:
+        #print("add error %s for cont1"%(hit1.target))
+        #print(h1_end, hdist)
+        #print(hit1)
+        err1=True
+    if h2_start > hdist and h2_start > globalVars.splitLen:
+        #print("add error %s for cont2"%(hit2.target))
+        #print(h2_start, hdist)
+        #print(hit2)
+        err2=True
+    '''
+    if cdist<-1*globalVars.splitLen:
+        if hit1.t_start > globalVars.splitLen:
+            print("add error %s for cont1"%(hit1.target))
+            print(hit1)
+            err1=True
+        if globalVars.contLenDICT[hit2.target]-hit2.t_end > globalVars.splitLen:
+            print("add error %s for cont2"%(hit2.target))
+            print(hit2)
+            err2=True
+    '''
+    #print("RevRev")
+    #print(err1, err2)
+    #print(hit1)
+    #print(h1_end)
+    #print(hit2)
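+    # cdist estimates the gap between the two contigs implied by consecutive
+    # read fragments: the read-implied distance hdist minus the extrapolated
+    # unaligned contig tails h1_end/h2_start. err1/err2 flag a contig whose
+    # overhang exceeds both hdist and the split length, i.e. a putative
+    # misjoin that the caller records for later breaking.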
+ #print(h2_start) + #print("\t%i\n"%(cdist)) + return cdist, err1, err2 + + +def calcHitContig_Dist_pos(hit1, hit2): + err1=False + err2=False + h1_end = hit1.t_len - (hit1.t_end + (hit1.q_len - hit1.q_end)) + h2_start = hit2.t_start - hit2.q_start + # hdist->distance between splits + if hit1.split != hit2.split: + hdist = ((hit2.split - hit1.split - 1) * hit1.q_len) + else: + hdist = 0 + # calc contig distance + cdist = hdist - (h1_end + h2_start) + if h1_end > hdist and h1_end > globalVars.splitLen: + #print("add error %s for cont1"%(hit1.target)) + #print(h1_end, hdist) + #print(hit1) + err1=True + if h2_start > hdist and h2_start > globalVars.splitLen: + #print(h2_start, hdist) + #print("add error %s for cont2"%(hit2.target)) + #print(hit2) + err2=True + + #print("PosPos") + #print(err1, err2) + #print(hit1) + #print(h1_end) + #print(hit2) + #print(h2_start) + #print("\t%i\n"%(cdist)) + return cdist, err1, err2 + + +def calcHitContig_Distance_posrev(hit1, hit2): + err1=False + err2=False + h1_end = hit1.t_len - (hit1.t_end + (hit1.q_len - hit1.q_end)) + h2_start = hit2.t_len - (hit2.t_end + hit2.q_start) + # hdist->distance between splits + if hit1.split != hit2.split: + hdist = ((hit2.split - hit1.split - 1) * hit1.q_len) + else: + hdist = 0 + # calc contig distance + cdist = hdist - (h1_end + h2_start) + if h1_end > hdist and h1_end > globalVars.splitLen: + #print("add error %s for cont1"%(hit1.target)) + #print(h1_end, hdist) + #print(hit1) + err1=True + if h2_start > hdist and h2_start > globalVars.splitLen: + #print("add error %s for cont2"%(hit2.target)) + #print(h2_start, hdist) + #print(hit2) + err2=True + ''' + if cdist<-1*globalVars.splitLen: + if globalVars.contLenDICT[hit1.target]-hit1.t_end > globalVars.splitLen: + print("add error %s for cont1"%(hit1.target)) + print(hit1) + err1=True + if globalVars.contLenDICT[hit2.target]-hit2.t_end > globalVars.splitLen: + print("add error %s for cont2"%(hit2.target)) + print(hit2) + err2=True + ''' + #print("PosRev") + #print(err1, err2) + #print(hit1) + #print(h1_end) + #print(hit2) + #print(h2_start) + #print("\t%i\n"%(cdist)) + return cdist, err1, err2 + + +def calcHitContig_Distance_revpos(hit1, hit2): + err1=False + err2=False + h1_end = hit1.t_start - (hit1.q_len - hit1.q_end) + h2_start = hit2.t_start - hit2.q_start + # hdist->distance between splits + if hit1.split != hit2.split: + hdist = ((hit2.split - hit1.split - 1) * hit1.q_len) + else: + hdist = 0 + # calc contig distance + cdist = hdist - (h1_end + h2_start) + if h1_end > hdist and h1_end > globalVars.splitLen: + #print("add error %s for cont1"%(hit1.target)) + #print(h1_end, hdist) + #print(hit1) + err1=True + if h2_start > hdist and h2_start > globalVars.splitLen: + #print("add error %s for cont2"%(hit2.target)) + #print(h2_start, hdist) + #print(hit2) + err2=True + ''' + if cdist<-1*globalVars.splitLen: + if hit1.t_start > globalVars.splitLen: + print("add error %s for cont1"%(hit1.target)) + print(hit1) + err1=True + if hit2.t_start > globalVars.splitLen: + print("add error %s for cont2"%(hit2.target)) + print(hit2) + err2=True + ''' + + #print("RevPos") + #print(err1, err2) + #print(hit1) + #print(h1_end) + #print(hit2) + #print(h2_start) + #print("\t%i\n"%(cdist)) + return cdist, err1, err2 + + +# compares 2 hits on different contigs --> adds edges +def connectCONTS(prevhit, curhit, edgeHash): + #print("running connectCONTS") + #print(prevhit) + #print(curhit) + #if alns not at borders add errors for breaking of misassemblies + errors=[] + if prevhit.strand == 
'+' and curhit.strand == '+': + # nodes to set connections from oriHA + startNode = prevhit.target + "_1" + endNode = curhit.target + "_0" + hDist, e1, e2 = calcHitContig_Dist_pos(prevhit, curhit) + #curError1=(prevhit.target, max(0,prevhit.t_end-100), min(prevhit.t_len-1, prevhit.t_end+100)) + #curError2=(curhit.target, max(0,curhit.t_start-100), min(curhit.t_len-1,curhit.t_start+100)) + curError1=(prevhit.target, prevhit.t_end+(prevhit.q_len-prevhit.q_end), prevhit.t_end+(prevhit.q_len-prevhit.q_end)) + curError2=(curhit.target, curhit.t_start-curhit.q_start, curhit.t_start-curhit.q_start) + + elif prevhit.strand == '-' and curhit.strand == '-': + # nodes to set connections from + startNode = prevhit.target + "_0" + endNode = curhit.target + "_1" + hDist, e1, e2 = calcHitContig_Dist_rev(prevhit, curhit) + #curError1=(prevhit.target, max(0,prevhit.t_start-100), min(prevhit.t_len-1,prevhit.t_start+100)) + #curError2=(curhit.target, max(0,curhit.t_end-100), min(curhit.t_len-1,curhit.t_end+100)) + curError1=(prevhit.target, prevhit.t_start+(prevhit.q_start), prevhit.t_start+(prevhit.q_start)) + curError2=(curhit.target, curhit.t_end+(prevhit.q_start), curhit.t_end+(prevhit.q_start)) + + elif prevhit.strand == '+' and curhit.strand == '-': + # nodes to set connections from + startNode = prevhit.target + "_1" + endNode = curhit.target + "_1" + hDist, e1, e2 = calcHitContig_Distance_posrev(prevhit, curhit) + #curError1=(prevhit.target, max(0,prevhit.t_end-100), min(prevhit.t_len-1,prevhit.t_end+100)) + #curError2=(curhit.target, max(0,curhit.t_end-100), min(curhit.t_len-1,curhit.t_end+100)) + curError1=(prevhit.target, prevhit.t_end+(prevhit.q_len-prevhit.q_end), prevhit.t_end+(prevhit.q_len-prevhit.q_end)) + curError2=(curhit.target, curhit.t_end+(prevhit.q_start), curhit.t_end+(prevhit.q_start)) + + elif prevhit.strand == '-' and curhit.strand == '+': + # nodes to set connections from + startNode = prevhit.target + "_0" + endNode = curhit.target + "_0" + hDist, e1, e2 = calcHitContig_Distance_revpos(prevhit, curhit) + #curError1=(prevhit.target, max(0,prevhit.t_start-100), min(prevhit.t_len-1,prevhit.t_start+100)) + #curError2=(curhit.target, max(0,curhit.t_start-100), min(curhit.t_len-1,curhit.t_start+100)) + curError1=(prevhit.target, prevhit.t_start+(prevhit.q_start), prevhit.t_start+(prevhit.q_start)) + curError2=(curhit.target, curhit.t_start-curhit.q_start, curhit.t_start-curhit.q_start) + + #if contigs had long overlap left + #if e1==True and prevhit.split==curhit.split-1: + if e1==True: + #print("APPEND MISJOIN", curError1) + #print(prevhit) + #print(curhit) + errors.append(curError1) + #if e2==True and prevhit.split==curhit.split-1: + if e2==True: + #print("APPEND MISJOIN", curError2) + #print(prevhit) + #print(curhit) + errors.append(curError2) + # sort both nodes lexicographically to not have 2 different edges between + # the same nodes e.g. 
one a->b and b->a in the dict + if startNode < endNode: + node1 = startNode + node2 = endNode + else: + node1 = endNode + node2 = startNode + + + # add edge to "first" node + if not node1 in edgeHash: + edgeHash[node1] = {} + edgeHash[node1][node2] = [hDist] + #print("added edge with weight %i for conts %s and %s"%(hDist, node1, node2)) + #save readID spanning the gap + globalVars.gapDICT[node1]={} + globalVars.gapDICT[node1][node2]=[curhit.query] + #print("Read %s connects conts %s and %s with dist %i"%(curhit.query, node1, node2, hDist)) + #print(edgeHash[node1][node2]) + else: + if not node2 in edgeHash[node1]: + edgeHash[node1][node2] = [hDist] + #print("added edge with weight %i for conts %s and %s"%(hDist, node1, node2)) + #save readID spanning the gap + globalVars.gapDICT[node1][node2]=[curhit.query] + #print("Read %s connects conts %s and %s with dist %i"%(curhit.query, node1, node2, hDist)) + #print(edgeHash[node1][node2]) + else: + edgeHash[node1][node2].append(hDist) + #print("added edge with weight %i for conts %s and %s"%(hDist, node1, node2)) + #save readID spanning the gap + globalVars.gapDICT[node1][node2].append(curhit.query) + #print("Read %s connects conts %s and %s with dist %i"%(curhit.query, node1, node2, hDist)) + #print(edgeHash[node1][node2]) + return errors + + +# general function to loop over each hit list and compare each split to +# the next one --> calls connectCONTs and checkCONT +def checkHitlist(hitlist, edgeGRAPH, oriHASH): + lastHit = "" + # compare 2 hits following each other + for curHits in hitlist: + + #if curHit.target==globalVars.largestContig[0]: + #print("hit on largest contig found --> add coverage") + # addLongestCOV(curHit) + #if len(curHits)>1: + # for curHit in curHits: + # globalVars.goodE[curHit.target].append((curHit.target, min(curHit.t_start, curHit.t_end), max(curHit.t_start, curHit.t_end))) + + if len(curHits)==1: + curHit=curHits[0] + #globalVars.goodE[curHit.target].append((curHit.target, min(curHit.t_start, curHit.t_end), max(curHit.t_start, curHit.t_end))) + #compare two hits + if lastHit!="": + #print("Comparing 2 hits:") + #print(lastHit) + #print(curHit) + + # check hits on same contig --> e.g. 
continuity + if curHit.target == lastHit.target: + if curHit.strand == lastHit.strand: + #print("Check for INS/DEL") + # check for inseinsertErrorrtions and deletions + cError, cGood= checkINSDELS(lastHit, curHit) + #print("GOOD:", cGood) + #print("BAD:", cError) + + if cError!=None: + if cError[3] > 0: + globalVars.insE[cError[0]].append(cError) + else: + globalVars.delE[cError[0]].append(cError) + else: + globalVars.goodE[curHit.target].append((curHit.target, cGood[0], cGood[1])) + + else: + #print("Check for INV") + # add inversions + cError=checkINVS(lastHit, curHit, oriHASH) + if cError!=None: + globalVars.invE[cError[0]].append(cError) + + # check contig connections --> add edges to graph + else: + #print("Check for CONTIG CONNECTION\n") + cErrors=connectCONTS(lastHit, curHit, edgeGRAPH) + for e in cErrors: + #e[2]+=100 + globalVars.mjE[e[0]].append(e) + + lastHit=curHit + + return edgeGRAPH \ No newline at end of file diff --git a/globalVars.py b/globalVars.py new file mode 100644 index 0000000..32c75ca --- /dev/null +++ b/globalVars.py @@ -0,0 +1,249 @@ +import sqlite3, subprocess +############################### +#######global variables######## +############################### + +################################### +###classes######################### +################################### + + +class coGRAPH: + + def __init__(self): + self.cGraph={} + + def addContigEdge(self, cID, sLen): + cEdge1 = myEdge(cID + "_0", cID + "_1", sLen, sLen, 0, 'c') + cEdge2 = myEdge(cID + "_1", cID + "_0", sLen, sLen, 0, 'c') + self.cGraph[cID + "_0"] = myCont(cEdge1, "", {}) + self.cGraph[cID + "_1"] = myCont(cEdge2, "", {}) + + def removeNode(self, cID): + del self.cGraph[cID] + print("%s had been removed"%(cID)) + + #function to return distance between node a and b if there is a path + def returnDist(self, a, b, circle=False): + visited=[] + restart=False + #print("getting distance") + #print(a,b) + #print(self.cGraph[a].active) + #print(self.cGraph[b].active) + #print(self.cGraph[a].cEdge) + #print(self.cGraph[b].cEdge) + if a in self.cGraph and b in self.cGraph: + #print("both in graph") + nextN=a + #print(nextN) + #print(self.cGraph[nextN].active) + dist=0 + while self.cGraph[nextN].active != "": + visited.append(nextN) + dist+=self.cGraph[nextN].active.dist + nextN=self.cGraph[nextN].active.target + #print(nextN, b) + if nextN==b: + #print("FOUND IT") + if dist==0: + dist=1 + return(dist) + nextN=self.cGraph[nextN].cEdge.target + #print(nextN) + if nextN in visited: + restart=True + break + visited.append(nextN) + if circle==True: + if nextN==b: + #print("FOUND IT") + if dist==0: + dist=1 + return(dist) + + if restart==True: + #print("CIRCLE DETECTED RESTART") + return self.returnDist(a, b, True) + + return False + + else: + print("wut not both in graph???") + return False + + + # function to find start of a path starting with a node in the path + def findStart(self, anode): + inPath=[] + #print("Finding start for node %s"%(anode)) + # just to make sure to alayws start at 0 end of contig and then go to the + # left from there + if anode.endswith("0"): + cN = anode + elif anode.endswith("1"): + cN = self.cGraph[anode].cEdge.target + else: + print("ERROR incorrect scaffold name found %s" %(anode)) + + inPath.append(cN[:-2]) + + while self.cGraph[cN].active != "": + cN = self.cGraph[cN].active.target + #print(cN) + if cN[:-2] in inPath: + print("CIRCLE FOUND!!!!") + #print(self.cGraph[cN].cEdge) + #print(self.cGraph[cN].active) + return cN + inPath.append(cN[:-2]) + # move to other end of the 
contig for possible other read edge --> + # otherwse endless loop + # print(cN) + cN = self.cGraph[cN].cEdge.target + #print("Start of path found: %s"%(cN)) + return cN + + def getPath(self, anode): + inPath=[] + startNode=self.findStart(anode) + curNode=startNode + while 1: + if curNode in inPath: + print("getPath - Circle found!!!") + exit() + + inPath.append(curNode) + curNode=self.cGraph[curNode].cEdge.target + inPath.append(curNode) + + if self.cGraph[curNode].active == "": + print("End of path reached") + return inPath + else: + curNode=self.cGraph[curNode].active.target + + + + + # returns an alternating path of edges + def returnEgdePath(self, anode): + inPath=[] + ePath=[] + pHash={} + curDist=0 + startNode = self.findStart(anode) + curNode=startNode + inPath.append(startNode[:-2]) + pHash[startNode]=(curDist, 'a') + ePath.append(self.cGraph[curNode].cEdge) + curDist+=self.cGraph[curNode].cEdge.dist + curNode=self.cGraph[curNode].cEdge.target + + while self.cGraph[curNode].active != "": + pHash[curNode]=(curDist, 'b') + ePath.append(self.cGraph[curNode].active) + curDist+=self.cGraph[curNode].active.dist + curNode=self.cGraph[curNode].active.target + pHash[curNode]=(curDist, 'a') + ePath.append(self.cGraph[curNode].cEdge) + curDist+=self.cGraph[curNode].cEdge.dist + curNode=self.cGraph[curNode].cEdge.target + + if not curNode[:-2] in inPath: + inPath.append(curNode[:-2]) + + return ePath, pHash + + + +class myCont: + + def __init__(self, cEdge, active, eList): + self.cEdge=cEdge + self.active=active + self.eList=eList + + +# class for an edge +class myEdge: + + def __init__(self, start, target, weight, dist, stdev, direct): + self.start = start + self.target = target + self.weight = weight + self.dist = dist + self.stdev = stdev + self.direct = direct + + # comparing edes will comapre their weights + def __lt__(self, other): + return self.weight < other.weight + + def __gt__(self, other): + return self.weight > other.weight + + # printing function + def __str__(self): + return("\t%s, %s, weight:%i, dist:%i, stdev:%f, %s" % + (self.start, self.target, self.weight, self.dist, self.stdev, self.direct)) + + +# class for outputting scaffolds in the end +class mySCAF: + + def __init__(self, names, seqs, dirs): + self.names = names + self.seqs = seqs + self.dirs = dirs + + +# function to initialize global variables +def init(out, sL, mL, mO, mI, t, mSV, bPATH, mPATH): + global largestContig, distError, outDir, splitLen, minOverlap, minLinks + global minIdent, threads, contsMerged, contLenDICT, contDICT, mergedDICT, readHash + global contigGR, adjustDICT, gapDICT, eDICT, edgeDICT + global blastCounter, poserrorHASH, minSVL + global blastP, minimap2P + + global insE, delE, invE, mjE, goodE + insE={} + delE={} + invE={} + mjE={} + goodE={} + + blastCounter=0 + poserrorHASH={} + readHash={} + + outDir = out + splitLen = sL + #variable for largest contig + largestContig=("", 0) + #largestMapped=[] + #distance error is margin of read mappings + distError=float(splitLen)*0.15*2 + minLinks = mL + minOverlap = mO + minIdent = mI + threads = t + contsMerged=0 + minSVL=mSV + + contLenDICT = {} + contDICT = {} + # saves the ids of merged contigs + mergedDICT = {} + # contig GRAPH + contigGR = {} + # adjust hash stores edges and saves the new edge to which they should be + # redirected and the new distance that needs to be added to it + adjustDICT = {} + #for errors --> key scafID --> returns reads that need to be mapped back + eDICT = {} + # store readIDs of read spanning gaps + gapDICT={} + # for 
contig edges implied by hits + edgeDICT={} + diff --git a/gpma.py b/gpma.py new file mode 100644 index 0000000..e964d82 --- /dev/null +++ b/gpma.py @@ -0,0 +1,456 @@ +import globalVars +import merging +import copy + +################################## +################################## +######functions################### +################################## +################################## + + +# function to set edges in graph +def setEdges(setEdge, agraph): + + + if setEdge.direct!="": + newEdgeStart = globalVars.myEdge(setEdge.start, setEdge.target, + setEdge.weight, setEdge.dist, setEdge.stdev, setEdge.direct) + newEdgeTarget = globalVars.myEdge(setEdge.target, setEdge.start, + setEdge.weight, setEdge.dist, setEdge.stdev, setEdge.direct) + else: + newEdgeStart="" + newEdgeTarget="" + #print(setEdge) + #print("\tLooks good --> Setting edge: %s" % (newEdgeStart)) + #print("\tLooks good --> Setting edge: %s" % (newEdgeTarget)) + + oldEdge1=copy.copy(agraph.cGraph[setEdge.start].active) + oldEdge2=copy.copy(agraph.cGraph[setEdge.target].active) + if oldEdge1=="": + oldEdge1=globalVars.myEdge(setEdge.start, setEdge.target, 0, 0, 0, "") + if oldEdge2=="": + oldEdge2=globalVars.myEdge(setEdge.target, setEdge.start, 0, 0, 0, "") + agraph.cGraph[setEdge.start].active = newEdgeStart + agraph.cGraph[setEdge.target].active = newEdgeTarget + oldEdge=[oldEdge1, oldEdge2] + #print("OLD EDGE") + #print(oldEdge[0]) + #print(oldEdge[1]) + return oldEdge + +def resetEdge(setEdge, agraph): + #print("resetting edge") + #print(setEdge) + if setEdge.direct!="": + newEdgeStart = globalVars.myEdge(setEdge.start, setEdge.target, + setEdge.weight, setEdge.dist, setEdge.stdev, setEdge.direct) + else: + newEdgeStart="" + + agraph.cGraph[setEdge.start].active = newEdgeStart + +#function to adjust the gap dictionary in case +def adjustGAPdict(new1, new2, old1, old2): + + #print("Adjusting gap dict from %s to %s by taking them from %s to %s"%(new1, new2, old1, old2)) + + if old1 < old2: + n1 = old1 + n2 = old2 + else: + n1 = old2 + n2 = old1 + #print(n1, n2) + gapReads=globalVars.gapDICT[n1][n2] + + if new1 < new2: + node1 = new1 + node2 = new2 + else: + node1 = new2 + node2 = new1 + if globalVars.gapDICT.get(node1, "nope")=="nope": + globalVars.gapDICT[node1]={} + globalVars.gapDICT[node1][node2]=[] + elif globalVars.gapDICT[node1].get(node2, "nope")=="nope": + globalVars.gapDICT[node1][node2]=gapReads + else: + globalVars.gapDICT[node1][node2]+=gapReads + +# function to properly remove key from a dictionary +def removeMergedNEW(rkey, newTar, addDist): + + #print("Removing %s from dict"%(rkey)) + #print("Now pointing to %s with dist %i"%(newTar, addDist)) + globalVars.adjustDICT[rkey]=(newTar, addDist) + +def checkMerge(curEdge, cHASH): + #print(curEdge.dist, globalVars.minOverlap*-1) + #print(curEdge.dist <= globalVars.minOverlap*-1) + #print("\tContigs are apparently overlapping - check for overlap for possible merging") + #check if merging is possible and get the real overlap + merged, realOL, mAdjust = merging.runBLAST(curEdge.start, curEdge.target, curEdge.dist, True) + #if cant be merged still set edge with distance 0 + #print("\t"+str(merged)) + if merged==True: + #print("\tsucessfully merged") + #print(curEdge.start, curEdge.target, curEdge.dist, realOL, curEdge.dist*(-1)-realOL*(-1)) + curEdge.dist=realOL + + else: + if merged=="Removed": + #removeMergedNEW(cHASH, mAdjust[0][:-2]+"_0", mAdjust[1], mAdjust[2]) + #removeMergedNEW(cHASH, mAdjust[0][:-2]+"_0", mAdjust[3], mAdjust[4]) + curEdge.dist=realOL + 
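+            # merged=="Removed" means the BLAST check found one contig fully
+            # contained in the other; keep the real overlap so the caller
+            # (mergeNEW) can treat the contained contig as a duplicate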
#print("Cont %s was part of the other -> abort for now"%(mAdjust[0][:-2])) + + return merged, mAdjust + +def mergeNEW(nEdge, inGraph): + graph=inGraph.cGraph + sucess=True + edgeList=[] + removal=[] + #print("Start Merging Paths") + inPath=[] + curEdge=copy.copy(nEdge) + end=False + zipper='r' + #zip right + firstL=False + while end==False: + #print("while") + #print(curEdge) + if curEdge.start[:-2] == curEdge.target[:-2]: + #print("\tEdge with itself - skip") + end=True + break + + #if original edge is supposed to overlap contigs check for overlap first + #abort if overlap not there + if curEdge.dist < 0: + merged, eAdjust=checkMerge(curEdge, inGraph) + if merged==False: + if curEdge.dist > (-2)*globalVars.distError: + #print("Merging failed but distance within range --> set to 1N for now") + curEdge.dist=1 + else: + #print("Merging failed!!!") + sucess=False + return sucess, [], [] + if merged == "Removed": + next1=copy.copy(graph[curEdge.start].active) + next2=copy.copy(graph[curEdge.target].cEdge) + next22=copy.copy(graph[next2.target].active) + #edgeList.append(copy.copy(curEdge)) + if curEdge.dist*(-1) >= globalVars.contLenDICT[curEdge.target[:-2]]-20: + #print("Duplicate case1: contig %s is part of %s"%(curEdge.target, curEdge.start)) + if next22=="": + #print("End of path reached") + removal.append(eAdjust) + if graph[curEdge.target].active!="": + curEdge=copy.copy(graph[curEdge.target].active) + #print("Trying to zipper left path from merged cont guide edge:") + #print(curEdge) + else: + #print("nothing left to merge done here") + break + else: + curEdge.start=curEdge.start + curEdge.target=next22.target + curEdge.dist=curEdge.dist + (next2.dist + next22.dist) + #print("Merged implied edge:") + #print(curEdge) + removal.append(eAdjust) + #next iteration + continue + elif curEdge.dist*(-1) >= globalVars.contLenDICT[curEdge.start[:-2]]-20: + #print("Duplicate case2: contig %s is part of %s"%(curEdge.start, curEdge.target)) + if next1=="": + #print("End of path reached") + removal.append(eAdjust) + otherEnd = graph[curEdge.start].cEdge.target + if graph[otherEnd].active!="": + curEdge=copy.copy(graph[otherEnd].active) + #print("Trying to zipper left path from merged cont guide edge:") + #print(curEdge) + else: + #print("nothing left to merge done here") + break + else: + curEdge.start=next2.target + curEdge.target=next1.target + curEdge.dist=next1.dist - curEdge.dist - next2.dist + #print("Merged implied edge:") + #print(curEdge) + removal.append(eAdjust) + #next iteration + continue + else: + print("WTF why removed???") + print(globalVars.contLenDICT[curEdge.start[:-2]]) + print(globalVars.contLenDICT[curEdge.target[:-2]]) + exit() + + #sucess=False + #return sucess, [] + + if firstL==True: + #print("skip first left") + next1 = globalVars.myEdge(edgeList[0].target, edgeList[0].start, edgeList[0].weight, edgeList[0].dist, edgeList[0].stdev, edgeList[0].direct) + firstL=False + else: + next1=copy.copy(graph[curEdge.start].active) + next2=copy.copy(graph[curEdge.target].cEdge) + if next1!="": + next11=copy.copy(graph[next1.target].cEdge) + #print(next1) + #print(next11) + #print(next2) + if curEdge.start == next1.start and curEdge.target == next1.target: + if abs(curEdge.dist - next1.dist) < 3*curEdge.stdev: + sucess=True + break + else: + #print("Egde already exists but distance failed") + next1="" + sucess=False + break + elif next1=="": + #print("End of %s-path reached edge looks good to set"%(zipper)) + edgeList.append(curEdge) + if zipper=='r': + zipper='l' + firstL=True + 
curEdge=copy.copy(graph[nEdge.target].active) + if curEdge!="": + #print("Checking left path starting with edge") + #print(curEdge) + continue + else: + #print("Original edge had nothing left to merge") + #print(nEdge) + #print(curEdge) + end=True + break + else: + end=True + break + + + #case curEdge+contigEdge of p2 fits in next read edge of p1 + #case1 + #if next1.dist >= curEdge.dist and next1.dist > 0: + if next1.dist >= curEdge.dist: + #adjust gapHASH if new dist > 0 + if (next1.dist-(curEdge.dist+next2.dist)) > (-2)*globalVars.distError: + #print("New dist %i > 0 adjust gapHASH"%(next1.dist-(curEdge.dist+next2.dist))) + adjustGAPdict(next2.target, next1.target, next1.start, next1.target) + #add curEdge to list because it would be set if merging is possible + edgeList.append(copy.copy(curEdge)) + #imply new edge + curEdge.start=next2.target # flip target and start here to continue + curEdge.target=next1.target # flip target and start here to continue + curEdge.dist=next1.dist-(curEdge.dist+next2.dist) + #print("\timplied edge1") + #print(curEdge) + continue + + #case readEdge+ next contigEdge of p1 fits in curEdge + #case2 + #elif curEdge.dist > next1.dist and curEdge.dist > 0: + elif curEdge.dist > next1.dist: + #adjust gapHASH if new dist > 0 + if (curEdge.dist-(next1.dist+next11.dist)) > (-2)*globalVars.distError: + #print("New dist %i > 0 adjust gapHASH"%(curEdge.dist-(next1.dist+next11.dist))) + adjustGAPdict(next11.target, curEdge.target, curEdge.start, curEdge.target) + #imply new edge + curEdge.start=next11.target # no flip needed + curEdge.target=curEdge.target # no flip needed + curEdge.dist=curEdge.dist-(next1.dist+next11.dist) + + #print("\timplied edge2") + #print(curEdge) + continue + + return sucess, edgeList, removal + + # returns an alternating path of edges +def getNodePath(ggraph, anode): + #print("Get node path") + ePath=[] + pHash=[] + curDist=0 + startNode = ggraph.findStart(anode) + curNode=startNode + pHash.append(startNode) + ePath.append(ggraph.cGraph[curNode].cEdge) + curDist+=ggraph.cGraph[curNode].cEdge.dist + curNode=ggraph.cGraph[curNode].cEdge.target + + while ggraph.cGraph[curNode].active != "" and curNode not in pHash: + #print(curNode) + pHash.append(curNode) + ePath.append(ggraph.cGraph[curNode].active) + curNode=ggraph.cGraph[curNode].active.target + #print(curNode) + pHash.append(curNode) + ePath.append(ggraph.cGraph[curNode].cEdge) + curNode=ggraph.cGraph[curNode].cEdge.target + + + pHash.append(curNode) + + return ePath, pHash + + +def checkHappiness(pdict, cDICT): + nH=0 + nUH=0 + #print("Checking happiness of path") + checked=[] + #print(len(pdict)) + for c1 in pdict: + checked.append(c1) + for c2 in pdict: + #print(c1, c2) + # sort both nodes lexicographically to not have 2 different edges between + # the same nodes e.g. 
one a->b and b->a in the dict + if c1 < c2: + node1 = c1 + node2 = c2 + else: + node1 = c2 + node2 = c1 + + if not c2 in checked: + if node1 in globalVars.edgeDICT: + if node2 in globalVars.edgeDICT[node1]: + dist=cDICT.returnDist(node1, node2) + #print("%s and %s both in list with dist: %i"%(node1, node2, dist)) + curEdges=globalVars.edgeDICT[node1][node2] + #curEdges+=globalVars.edgeDICT[node2][node1] + for curEdge in curEdges: + if curEdge[0] >= globalVars.minLinks: + #print(node1, node2) + #print("comparing distance %i with edge"%(dist)) + #print(curEdge) + if dist != False: + #print("Orientations fit set distance is %i predicted distance %i"%(dist, curEdge[1])) + if abs(curEdge[1] - dist) <= max(3*curEdge[2], 100): #check if within 3*stdev of bundeld edge + #print("Distance fits increasing happy by %i"%(curEdge[0])) + nH+=curEdge[0] + continue + else: + #print("Distance does not fit increasing UNhappy by %i"%(curEdge[0])) + nUH+=curEdge[0] + continue + else: + #print("Incorrect orientation increasing UNhappy by %i"%(curEdge[0])) + nUH+=curEdge[0] + continue + + + return nH, nUH + + +# main function to run gpma +def gpma(newEdge, inGraph): + #print("gpma") + inputEdge=copy.copy(newEdge) + #print(newEdge) + + # check if one edge points to a merged contig --> in that case needs to be + # adjusted + # while loop to go through multiple adjustments + while globalVars.adjustDICT.get(newEdge.start, "nope") != "nope": + #print(inputEdge) + #if globalVars.adjustDICT.get(newEdge.start, "nope") != "nope": + newEdge.dist -= globalVars.adjustDICT[newEdge.start][1] + newEdge.start = globalVars.adjustDICT[newEdge.start][0] + #print(newEdge.start, newEdge.target, inputEdge.start, inputEdge.target) + adjustGAPdict(newEdge.start, newEdge.target, inputEdge.start, inputEdge.target) + + #print("edge adjusted to merged edge") + #print(newEdge) + + while globalVars.adjustDICT.get(newEdge.target, "nope") != "nope": + #print(inputEdge) + #if globalVars.adjustDICT.get(newEdge.target, "nope") != "nope": + newEdge.dist -= globalVars.adjustDICT[newEdge.target][1] + newEdge.target = globalVars.adjustDICT[newEdge.target][0] + #print(newEdge.start, newEdge.target, inputEdge.start, inputEdge.target) + adjustGAPdict(newEdge.start, newEdge.target, inputEdge.start, inputEdge.target) + #print("edge adjusted to merged edge") + #print(newEdge) + + + inputEdge=copy.copy(newEdge) + #start directions + + #print("Path1:") + #ePath1, pHASH1 = inGraph.returnEgdePath(newEdge.start) + ePath1, pHASH1 = getNodePath(inGraph, newEdge.start) + #print(len(pHASH1)) + #for ele in ePath1: + # print(ele) + nHP1, nUHP1 = checkHappiness(pHASH1, inGraph) + #print("Happiness p1: %i/%i(H/UH)"%(nHP1, nUHP1)) + + #print("Path2:") + #ePath2, pHASH2 = inGraph.returnEgdePath(newEdge.target) + ePath2, pHASH2 = getNodePath(inGraph, newEdge.target) + #print(len(pHASH2)) + #for ele in ePath2: + # print(ele) + nHP2, nUHP2 = checkHappiness(pHASH2, inGraph) + #print("Happiness p2: %i/%i(H/UH)"%(nHP2, nUHP2)) + + result, setList, removed = mergeNEW(inputEdge, inGraph) + + if result==True: + oldEdges=[] + for edge in setList: + oE=setEdges(edge, inGraph) + oldEdges=oE+oldEdges + + #print("New path:") + t=True + for ele in removed: + if ele[0][:-2] == newEdge.start[:-2]: + t=False + if t== True: + newPath, newHASH = getNodePath(inGraph, newEdge.start) + else: + newPath, newHASH = getNodePath(inGraph, newEdge.target) + #print(len(newHASH)) + #for ele in newPath: + # print(ele) + newHP, newUHP = checkHappiness(newHASH, inGraph) + + + #check happiness + if newHP - 
(nHP1+nHP2) >= newUHP - (nUHP1+nUHP2): + #print("GPMA VERY HAPPY") + for remove in removed: + removeMergedNEW(remove[0][:-2]+"_0", remove[1], remove[2]) + removeMergedNEW(remove[0][:-2]+"_1", remove[3], remove[4]) + else: + #print("GPMA UNHAPPY EDGE SHOULD NOT HAVE BEEN SET") + #[print(e) for e in newPath] + #print("Happiness NEW PATH: %i/%i(H/UH)"%(newHP, newUHP)) + #[print(e) for e in ePath1] + #print("Happiness p1: %i/%i(H/UH)"%(nHP1, nUHP1)) + #[print(e) for e in ePath2] + #print("Happiness p2: %i/%i(H/UH)"%(nHP2, nUHP2)) + for edge in oldEdges: + resetEdge(edge, inGraph) + + + + #else: + # print("GPMA FAILED!!!") + + + diff --git a/main.py b/main.py new file mode 100755 index 0000000..83ed55f --- /dev/null +++ b/main.py @@ -0,0 +1,975 @@ +#!/usr/bin/env python3 + +import argparse, subprocess, sys +import statistics +import datetime +import copy +# import own scripts for stuff +import checkHitlist +from gpma import gpma +import globalVars +import splitReads +import merging + +try: + from subprocess import DEVNULL # py3k +except ImportError: + import os # for python2 + DEVNULL = open(os.devnull, 'wb') +# biopython for now --> for reverse complement function +#from Bio.Seq import Seq + + +############################### +#######global variables######## +############################### + + + +# key is readID(without split) --> save orientation for each read to check +# for inversions later +oriHASH = {} + +################################### +#############classes############### +################################### + + +# class for for each hit within the PAF file +class myHIT: + + def __init__(self, query, split, strand, target, t_len, t_start, t_end, q_len, q_start, q_end, mq): + self.query = query + self.split = split + self.strand = strand + self.target = target + self.t_len = t_len + self.t_start = t_start + self.t_end = t_end + self.q_len = q_len + self.q_start = q_start + self.q_end = q_end + self.mq = mq + + def __str__(self): + return("\t%s, %i, %s, %s, %s, %s, %s, %s, %s, %s, %i" % (self.query, self.split, self.strand, self.target, str(self.t_len), str(self.t_start), str(self.t_end), str(self.q_len), str(self.q_start), str(self.q_end), self.mq)) + # comparing edges will compare their weights + + def __lt__(self, other): + return self.q_start < other.q_start + + def __gt__(self, other): + return self.q_start > other.q_start + +# class for outputting scaffolds in the end +class mySCAF: + + def __init__(self, names, seqs): + self.names = names + self.seqs = seqs + + + +################################### +###functions####################### +################################### + +# function to parse the config file containing the paths to +# blastn and minimap2 +def parsePATHs(): + scriptPath=sys.argv[0].rstrip("main.py") + configReader=open("%s/config.txt"%scriptPath) + minimap2Path="" + blastnPath="" + for line in configReader: + if not line.startswith("#"): + if line.startswith("MINIMAP2"): + minimap2Path = line.strip().split("=")[-1] + elif line.startswith("BLASTN="): + blastnPath = line.strip().split("=")[-1] + if minimap2Path=="": + print("Error no entry for minimap2 path in config.txt found - exiting...") + exit() + if blastnPath=="": + print("Error no entry for blastn path in config.txt found - exiting...") + exit() + configReader.close() + #print("minimap path: %s"%(minimap2Path)) + #print("blastn path: %s"%(blastnPath)) + + return minimap2Path, blastnPath + +#function to reverse complement a sequence +def reverseComplement(seq): + old = "ACGT" + replace = "TGCA" + return 
seq.upper().translate(str.maketrans(old, replace))[::-1] + +# function to run minimap2 +def runMinimap2(nThreads, mapMode, inFA, inFQ, outPAF): + scriptPath=sys.argv[0].rstrip("main.py") + try: + subprocess.call("%s/minimap2/minimap2 -t %i -x %s %s %s> %s" % + (scriptPath, nThreads, mapMode, inFA, inFQ, outPAF), shell=True, stdout=DEVNULL, stderr=subprocess.STDOUT) + #subprocess.call("%s/minimap2/minimap2 -t %i -x %s %s %s> %s" % + # (scriptPath, nThreads, mapMode, inFA, inFQ, outPAF), shell=True) + except: + print("Problem running minimap2 - check if its properly set up") + +# function to parse contigs and initialize the contigGraph +def parseContigs(path, cograph): + if path.endswith(".gz"): + import gzip + faParser = gzip.open(path, mode='rt') + else: + faParser = open(path) + # for first iteration/line + seq = "" + for line in faParser: + if line.startswith(">"): + if seq != "": + seqLen = len(seq) + # initalize contigGraph + cograph.addContigEdge(contID, seqLen) + # initalize contLenDICT + globalVars.contLenDICT[contID] = seqLen + globalVars.contDICT[contID] = seq + globalVars.insE[contID]=[] # insertions + globalVars.invE[contID]=[] # inversions + globalVars.delE[contID]=[] # deletions + globalVars.mjE[contID]=[] # misjoins + globalVars.goodE[contID]=[] # continuous mapping hits + if seqLen>=globalVars.largestContig[1]: + globalVars.largestContig=(contID, seqLen) + + contID = line.strip().split(" ")[0][1:] + seq = "" + else: + seq += line.strip() + + faParser.close() + + # for last sequence not covered by loop + seqLen = len(seq) + # initalize contigGraph + cograph.addContigEdge(contID, seqLen) + # initalize globalVars.contLenDICT + globalVars.contLenDICT[contID] = seqLen + globalVars.contDICT[contID] = seq + globalVars.insE[contID]=[] + globalVars.invE[contID]=[] + globalVars.delE[contID]=[] + globalVars.mjE[contID]=[] + globalVars.goodE[contID]=[] + + + +#function to bundle a set of edges between two nodes +#as described in gpma +def bundleEdges(edgeSet, noMed=False): + #print("BUNDELING:") + #print(edgeSet) + nm=False + bundledList=[] + if noMed ==True: # if median picked before clusered nothing take first edge + #print("Taking median failed before so now just pick first edge") + medianEdgeLen=edgeSet[0] + else: + medianEdgeLen = statistics.median(edgeSet) #calc median + #stDev=numpy.std(edgeSet) + #print("Median edge chosen: %i"%(medianEdgeLen)) + #print(stDev, globalVars.distError*2) + newWeight=0 + fitDists=[] + #if edgeSet only has length one to avaoid unnecessary computations + if len(edgeSet) == 1 and globalVars.minLinks<=1: + bundledList=[(1, edgeSet[0], 0)] + edgeSet.remove(edgeSet[0]) + + else: + for dist in edgeSet: + #print(abs(medianEdgeLen - dist), 2*globalVars.distError) + if abs(medianEdgeLen - dist) < 2*globalVars.distError: + newWeight+=1 + fitDists.append(dist) + #print("dist %i fit and was added and removed!"%dist) + # important to remove after loop otherwise it causes the error of skipping some edges + for dist in fitDists: + edgeSet.remove(dist) + if len(fitDists) > 1: + # bundle edges as defined in GPMA + p=sum([dist/((globalVars.distError*2)**2) for dist in fitDists]) + q=sum([1/((globalVars.distError*2)**2) for dist in fitDists]) + newDist=int(p/q) + newSDev=statistics.stdev(fitDists) + newEdge=(newWeight, newDist, newSDev) + #print("bundeled new edge added:") + #print(newEdge) + #print(fitDists) + bundledList.append(newEdge) + elif len(fitDists) == 0: + #print("Bundeling failed") + nm=True + #if globalVars.minLinks<=1: + # bundledList+=[(1, edge, 0) for 
edge in edgeSet] + # edgeSet=[] + + elif len(fitDists) == 1 and globalVars.minLinks<=1: + newDist=fitDists[0] + bundledList.append((1, fitDists[0], 0)) + + + #newEdge = (edgeWeight, medianEdgeLen) + + + # repeat if still unbundled edges left + if len(edgeSet)>0: + #print("recall bundeling there is still some edges left") + bundledList+=bundleEdges(edgeSet, nm) + #print("\n\n") + return(bundledList) + + +# parses PAF gets hitlists and checks them --> builds edges and bundles them +# returns set of bundled edges +def parsePAF(path): + if path.endswith(".gz"): + import gzip + pafParser = gzip.open(path) + else: + pafParser = open(path) + + # list for all hit of one reads --> e.g. best hit per split part + hitList = [] + posCounts = 0 + revCounts = 0 + + for line in pafParser: + splitted = line.strip().split("\t") + query = splitted[0] + q_len = int(splitted[1]) + q_start = int(splitted[2]) + q_end = int(splitted[3])-1 + strand = splitted[4] + tar = splitted[5] + t_len = int(splitted[6]) + t_start = int(splitted[7]) + t_end = int(splitted[8])-1 + #matching = splitted[9] + #opts = splitted[10] + mq = int(splitted[11]) + #P for primary aln or S for secondary + #alnType=splitted[12].split(":")[-1] + + # get fragment ID + qID = query.split("_")[0] + splitID = int(query.split("_")[-1]) + + curHIT = myHIT(qID, splitID, strand, tar, t_len, + t_start, t_end, q_len, q_start, q_end, mq) + + # take only the best hit for each fragment --> replace if the last one was + + if q_end-q_start>q_len*0.3: + if len(hitList)==0: + hitList.append([curHIT]) + continue + if curHIT.query == hitList[-1][0].query: + #if curHIT.split == lastHit.split: + + if curHIT.split == hitList[-1][0].split: + #print("Mult hit found") + #print(lastHit) + #print(curHIT) + #identical hit + hitList[-1].append(curHIT) + else: + hitList.append([curHIT]) + if curHIT.strand == "+": + posCounts += 1 + elif curHIT.strand == "-": + revCounts += 1 + + else: + # more than 1 hit --> only then it makes sense to check the + # hitlist + if len(hitList) > 1: + #check the hitlist to identify SVs misjoins and edges + checkHitlist.checkHitlist(hitList, globalVars.edgeDICT, oriHASH) + + # start hit list for new read and reset repCounter + hitList = [] + if q_end-q_start>=q_len*0.3: + hitList.append([curHIT]) + if curHIT.strand == "+": + posCounts += 1 + elif curHIT.strand == "-": + revCounts += 1 + + #for last hitlist + checkHitlist.checkHitlist(hitList, globalVars.edgeDICT, oriHASH) + + pafParser.close() + + + print("\t2.1) Bundling Edges") + edgeLIST = [] + # bundleEdges + for cont in globalVars.edgeDICT: + for tCont in globalVars.edgeDICT[cont]: + bundeledEdges=bundleEdges(globalVars.edgeDICT[cont][tCont]) + #print(bundeledEdges) + globalVars.edgeDICT[cont][tCont] = bundeledEdges + + # ignore edges with only weight 1 + acceptedEdges=[] + for edge in globalVars.edgeDICT[cont][tCont]: + + #print(edge) + if edge[0] >= globalVars.minLinks: + + #check if overlap so large it indicates a merge + if edge[1]*-1 >= globalVars.contLenDICT[cont[:-2]] or edge[1]*-1 >= globalVars.contLenDICT[tCont[:-2]]: + #print("overlap so large it indicates a merge") + if not edge[1]*-1 > 50000: + merged, realOL, mAdjust = merging.runBLAST(cont, tCont, edge[1], True) + if merged=="Removed": + print("Merging passed") + #removeMergedNEW(remove[0][:-2]+"_0", remove[1], remove[2]) + #removeMergedNEW(remove[0][:-2]+"_1", remove[3], remove[4]) + #else: + # print("Merging failed") + else: + + newEdge = globalVars.myEdge(cont, tCont, edge[0], edge[1], edge[2], 'r') + 
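+                        # the insertion sort below keeps edgeLIST ordered by
+                        # weight, heaviest first, so runGPMA sets the
+                        # best-supported edges before weaker ones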
acceptedEdges.append(edge) + ins = False + for i, e in enumerate(edgeLIST): + if e < newEdge: + edgeLIST.insert(i, newEdge) + ins = True + break + + if ins == False: + edgeLIST.append(newEdge) + + #print("Accepted edges between %s and %s"%(cont, tCont)) + #for e in acceptedEdges: + # print(e) + #print("\n") + globalVars.edgeDICT[cont][tCont]=acceptedEdges + + return edgeLIST + + +# running GPMA from gpma.py +def runGPMA(orderedEdges, cGRAPH): + writer=open(globalVars.outDir + "/tmp_edges", "w") + for i, edge in enumerate(orderedEdges): + #print("Running GPMA trying to set edge (%i/%i):"%(i, len(orderedEdges)-1)) + #print(edge) + gpma(edge, cGRAPH) + writer.write(str(edge)+"\n") + writer.close() + +#function to cluster variants per contig +def clusterSVs(svList, cMap, writer, ms, misJoin=False, deletion=False): + #to avoid unnecessary checking + if len(svList) < 2: + return [] + + breaks=[] + import networkx as nx + G = nx.Graph() + nINS=len(svList) + #print(svList) + for i, sv1 in enumerate(svList): + for j in range(i+1, nINS): + sv2 = svList[j] + #print(sv1, sv2) + sd = abs((sv1[2]-sv1[1]) - (sv2[2]-sv2[1])) / max((sv1[2]-sv1[1]), (sv2[2]-sv2[1]), 1) + pd = min(abs(sv1[1]-sv2[1]), abs(sv1[2]-sv2[2]), abs((sv1[2]+sv1[1])/2 - (sv2[2]+sv2[1])/2)) + #print(sd, pd) + spd=sd+pd/2000 + #print(spd) + if spd < 0.7: + #print("CLUSTER TOGETHER") + G.add_edge(i, j) + #print("\n") + insClusts=nx.find_cliques(G) + #print("nClusters found:") + for iClust in insClusts: + #print(len(iClust)) + #print(iClust) + if len(iClust) > 2: + svB=[] + svE=[] + svW=0 + scaf="" + for i in iClust: + #print(svList[i]) + ins = svList[i] + scaf=ins[0] + svB.append(ins[1]) + svE.append(ins[2]) + svW+=1 + #print(ins) + + + #print(svB, svE) + + if misJoin==True: + svB=int(statistics.mean(svB)) + svE=int(statistics.mean(svE)+100) + if svE >= len(cMap): + if svB >= len(cMap): + continue + svE=len(cMap)-1 + #print(svB, svE) + meanGood=round(statistics.mean(cMap[svB:svE+1])) + meanSTD=round(statistics.stdev(cMap[svB:svE+1])) + meanGood+=(-3)*meanSTD + else: + svB=int(statistics.mean(svB)) + svE=int(statistics.mean(svE)) + + meanGood=round(statistics.mean(cMap[svB:svE+1])) + if deletion==True: + svW+=meanGood + #print(svB, svE, svW, meanGood, svW/(svW+meanGood)) + if svW > 2 and svW/(svW+meanGood)>=0.7: + #print("ACCEPT SV!!!") + #print(cMap[svB:svE+1]) + #print(svB, svE, svW, meanGood, svW/(svW+meanGood)) + #breaks.append((svB, svE)) + breaks.append(svB) + breaks.append(svE) + writer.write("%s\t%i\t%i\t%i\t%i\t%f\n"%(scaf ,svB, svE, svW, meanGood, svW/(svW+meanGood))) + cMap[svB]-=svW + if svB!=svE: + cMap[svE]-=svW + + + G.clear() + + return breaks + +# function to break contigs given a +def breakConts(bHASH, cit): + oWriter=open(args.outdir+"/"+"adjusted_contigs_it%i.fa"%(cit), "w") + for scaf in globalVars.contDICT: + if scaf in bHASH: + curStart=0 + counter=1 + breakEMargin=500 + for error in bHASH[scaf]: + #print(curStart, error, error-breakEMargin) + #only output fragments larger than 1000bp + if len(globalVars.contDICT[scaf][curStart:error-breakEMargin]) > 1000: + oWriter.write(">%s_%i\n"%(scaf, counter)) + #print("Write splitted scaf: %s from: %i-%i" %(scaf, curStart, error-breakEMargin)) + oWriter.write("%s\n"%(globalVars.contDICT[scaf][curStart:error-breakEMargin])) + + curStart=error+breakEMargin + counter+=1 + if len(globalVars.contDICT[scaf][curStart:]) > 1000: + oWriter.write(">%s_%i\n"%(scaf, counter)) + #print("Write splitted scaf: %s from: %i-%i" %(scaf, curStart, len(globalVars.contDICT[scaf]))) + 
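+                # writes the tail fragment (last breakpoint to contig end);
+                # the >1000 bp guard above filters out tiny leftovers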
oWriter.write("%s\n"%(globalVars.contDICT[scaf][curStart:])) + else: + oWriter.write(">%s\n"%(scaf)) + oWriter.write("%s\n"%(globalVars.contDICT[scaf])) + oWriter.close() + +# function that generates the pos. coverage map per contig +# and then runs clusterSVs on the different SVs for that contig +# and then checks if any contigs must be broken (breakConts) +def checkContigsNEW(curIt, noBreak): + svbreak=False + covWriter=open(globalVars.outDir + "/coverage_map_it%i.txt"%(curIt), "w") + insWriter=open(globalVars.outDir + "/insertions_it%i.txt"%(curIt), "w") + invWriter=open(globalVars.outDir + "/inversions_it%i.txt"%(curIt), "w") + delWriter=open(globalVars.outDir + "/deletions_it%i.txt"%(curIt), "w") + mjWriter=open(globalVars.outDir + "/misjoins_it%i.txt"%(curIt), "w") + svBreaks={} + for scaf in globalVars.goodE: + #print("Checking scaf %s"%(scaf)) + covMap=[0]*globalVars.contLenDICT[scaf] + breakpoints=[] + for g in globalVars.goodE[scaf]: + for i in range(g[1],g[2]+1): + covMap[i]+=1 + meanSupport=statistics.median(covMap[50:-50]) + #print("cont %s has a mean goodness of scaf %f"%(scaf, meanSupport)) + #print(covMap) + #print("INSERTIONS:") + breakpoints+=clusterSVs(globalVars.insE[scaf], covMap, insWriter, meanSupport) + #print("DELETIONS:") + breakpoints+=clusterSVs(globalVars.delE[scaf], covMap, delWriter, meanSupport, deletion=True) + #print("INVERSIONS:") + breakpoints+=clusterSVs(globalVars.invE[scaf], covMap, invWriter, meanSupport) + #print("MISJOINS:") + breakpoints+=clusterSVs(globalVars.mjE[scaf], covMap, mjWriter, meanSupport, misJoin=True) + + if breakpoints!=[]: + svbreak=True + svBreaks[scaf]=breakpoints + + checkCovMap(scaf, covMap, covWriter) + + + if svbreak==True and noBreak==False: + breakConts(svBreaks, curIt) + + covWriter.close() + insWriter.close() + delWriter.close() + invWriter.close() + mjWriter.close() + + return svbreak + +# outputs the coverage map +def checkCovMap(curCont, cmap, oWriter): + oWriter.write(">%s\n"%(curCont)) + totalBps=0 + nGood=0 + nBad=0 + nUnknown=0 + curRange=["", "", 0] + + for pos, cov in enumerate(cmap): + totalBps+=1 + if cov>0: + nGood+=1 + elif cov==0: + nUnknown+=1 + else: + nBad+=1 + if pos==0: + curRange[0]=pos + curRange[1]=pos + curRange[2]=cov + else: + if curRange[2]==cov: + curRange[1]=pos + else: + oWriter.write("\t%s\t%i\t%i\t%i\n"%(curCont, curRange[0], curRange[1], curRange[2])) + curRange[0]=pos + curRange[1]=pos + curRange[2]=cov + oWriter.write("\t%s\t%i\t%i\t%i\n"%(curCont, curRange[0], curRange[1], curRange[2])) + + +# function to write a scaffold for final output +def writeScaf(sSeqs, sNames, oNR, sWriter, gffWriter): + #Write output + newSeq = "" + #print("writing out") + + + for i, name in enumerate(sNames): + seq = sSeqs[i] + start=len(newSeq)+1 + newSeq += seq + end=len(newSeq) + if name.startswith("read_"): + gffWriter.write("scaf_%i\t.\tread(%s)\t%i\t%i\n"%(oNR, name, start, end)) + else: + gffWriter.write("scaf_%i\t.\tcontig(%s)\t%i\t%i\n"%(oNR, name, start, end)) + + + scafLen = len(newSeq) + sWriter.write(">scaf_%i(len=%i)\n" % (oNR, scafLen)) + # print(newName) + # print(newSeq) + myIndex = 0 + # fasta format capped at 80bps per line + while (myIndex + 80) <= len(newSeq): + sWriter.write("%s\n" % (newSeq[myIndex:myIndex + 80])) + myIndex += 80 + sWriter.write("%s\n" % (newSeq[myIndex:])) + + return scafLen + + + +# function to output all contigs/scaffolds in the end --> gets the path +# for each and then puts them together before writing out +def getPATH(myGraph): + scafWriter = open(globalVars.outDir 
+ "/scaffolds.fasta", "w") + scafCount = 0 + idWriter = open(globalVars.outDir + "/conts_scafs.ids", "w") + gffWriter = open(globalVars.outDir + "/scaffolds.gff", "w") + # hash to keep track of already visited contigs + cLens=[] + visited = {} + count=1 + #print(len(myGraph.cGraph)) + for cont in myGraph.cGraph: + #print("Checking cont %s - %i/%i"%(cont, count, len(myGraph.cGraph))) + count+=1 + if not cont[:-2] in visited: + startNode = myGraph.findStart(copy.copy(cont)) + #print(myGraph.getPath(copy.copy(cont))) + first=True + #print("Getting path for cont %s start of path is at: %s"%(cont, startNode)) + + newScaf = mySCAF([], []) # intitalize output scaf + idWriter.write(">scaf_%i\n"%(scafCount)) + curNode=copy.copy(startNode) + while 1: + #print("while curNode=%s"%(curNode)) + # check if already visited + if not curNode[:-2] in visited: + visited[curNode[:-2]]=1 + else: + #print("CIRCLE FOUND") + curLen=writeScaf(newScaf.seqs, newScaf.names, scafCount, scafWriter, gffWriter) + cLens.append(curLen) + scafCount+=1 + idWriter.write("\n") + break + visited[curNode[:-2]] = 1 + #print("visited %s"%(curNode[:-2])) + + #case for startNode set dist to 0 + if first==True: + first=False + offset=0 + else: + offset = myGraph.cGraph[curNode].active.dist *(-1) # change pos/neg to get offset for contigSeq + if myGraph.cGraph[curNode].active != "": + idWriter.write("\tdist=%i, links=%i\n"%(offset*-1, myGraph.cGraph[curNode].active.weight)) + else: + idWriter.write("\tdist=%i\n"%(offset*-1)) + + + + # get scaf seq in correctOrientation + if curNode[-1]=="0": + contSeq = globalVars.contDICT[curNode[:-2]] + elif curNode[-1]=="1": # reverse complement seq + #contSeq=str(Seq(globalVars.contDICT[curNode[:-2]]).reverse_complement()) + contSeq=reverseComplement(globalVars.contDICT[curNode[:-2]]) + + + idWriter.write("\t%s(len=%i)\n"%(curNode, globalVars.contLenDICT[curNode[:-2]])) + + #fill gap + if offset < 0: # gap that must be filled exists + #print("Filling gap between %s and %s"%(myGraph.cGraph[curNode].active.target, curNode)) + fillID, fillSeq = fillEDGEgap(myGraph.cGraph[curNode].active.target, curNode, newScaf.seqs[len(newScaf.seqs)-1], contSeq, offset*-1) + #print("Filler = %s len=%i"%(fillID, len(fillSeq))) + + if fillID=="100N": + #print("Dont set edge") + del visited[curNode[:-2]] + myGraph.cGraph[curNode].active="" + curLen=writeScaf(newScaf.seqs, newScaf.names, scafCount, scafWriter, gffWriter) + cLens.append(curLen) + scafCount+=1 + idWriter.write("\n") + break + else: + newScaf.names.append("read_"+fillID) + newScaf.seqs.append(fillSeq) + + offset = 0 + + #print("appending cont %s with offset %i"%(curNode, offset)) + newScaf.names.append(curNode) + #print("appending seq of len=%i"%(len(contSeq[offset:]))) + newScaf.seqs.append(contSeq[offset:]) + + curNode = myGraph.cGraph[curNode].cEdge.target + + if myGraph.cGraph[curNode].active=="": + #print("End of path reached") + curLen=writeScaf(newScaf.seqs, newScaf.names, scafCount, scafWriter, gffWriter) + cLens.append(curLen) + scafCount+=1 + idWriter.write("\n") + break + else: + #print(myGraph.cGraph[curNode].active) + curNode = myGraph.cGraph[curNode].active.target + #print("Next in path is %s"%(curNode)) + #print(myGraph.cGraph[curNode].active) + + scafWriter.close() + idWriter.close() + gffWriter.close() + return cLens + + +# function to fill the gap between to contigs with a pacbio read +def fillEDGEgap(cont1, cont2, cSeq1, cSeq2, gLen): + gapFilled=False + #print("Trying to fill gap between %s and %s "%(cont1, cont2)) + #sort contigs to get 
+# function to fill the gap between two contigs with a long (e.g. PacBio) read
+def fillEDGEgap(cont1, cont2, cSeq1, cSeq2, gLen):
+    gapFilled = False
+    #print("Trying to fill gap between %s and %s" % (cont1, cont2))
+    # sort the contigs to get the same order as before for the edge connection
+    if cont1 < cont2:
+        node1 = cont1
+        node2 = cont2
+    else:
+        node1 = cont2
+        node2 = cont1
+    gapReads = globalVars.gapDICT[node1][node2]
+
+    # only align against the facing 1000 bp of each contig
+    if len(cSeq1) > 1000:
+        adjustedSeq1 = cSeq1[-1000:]
+        adjustedLen1 = 1000
+    else:
+        adjustedSeq1 = cSeq1
+        adjustedLen1 = len(cSeq1)
+    if len(cSeq2) > 1000:
+        adjustedSeq2 = cSeq2[:1000]
+        adjustedLen2 = 1000
+    else:
+        adjustedSeq2 = cSeq2
+        adjustedLen2 = len(cSeq2)
+
+    for read in gapReads:
+        #print("\tUsing read %s" % (read))
+        rSeq = globalVars.readHash[read]
+        gapFilled = merging.fillGAPS(cont1, cont2, adjustedLen1, adjustedLen2, adjustedSeq1, adjustedSeq2, read, len(rSeq), rSeq)
+
+        # if the gap could be filled, abort the gap-filling loop
+        if gapFilled != False:
+            break
+
+    if gapFilled == False:
+        # gap could not be filled --> bridge with 100 Ns instead
+        read = "100N"
+        gapFilled = "N" * 100
+
+    return read, gapFilled
+
+
+# calculates the N-statistics (N10, ..., N50, ..., N100);
+# expects scafLens sorted in descending order
+def calcNstats(scafLens, totalLen):
+    Nvals = [(0, 0)] * 10
+    n = 0
+    curSum = 0
+    curN = 0.1
+    curPos = 0
+    for slen in scafLens:
+        curSum += slen
+        n += 1
+        # a single long scaffold can cross several thresholds at once, so
+        # advance with a while loop instead of a single if (the original if
+        # could skip thresholds)
+        while curPos < 10 and curSum >= totalLen * curN:
+            Nvals[curPos] = (slen, n)
+            curN = round(curN + 0.1, 2)
+            curPos += 1
+
+    return Nvals
+
+
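+# NOTE (editor): worked example for calcNstats(), illustrative only:
+# for sorted lengths [50, 30, 20] (total 100) the largest scaffold alone
+# already covers >= 50% of the assembly, so calcNstats([50, 30, 20], 100)[4]
+# (the N50 slot) is (50, 1): N50 length 50, reached with 1 scaffold.
+
+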
+# function to generate and output the scaffold statistics
+def scafStats(sLens):
+    writer = open(globalVars.outDir + "/scaffolds.stats", "w")
+    sortedLens = sorted(sLens, reverse=True)
+    totalBps = sum(sLens)
+    totalScafs = len(sLens)
+    avgLen = statistics.mean(sLens)
+
+    writer.write("Total number of basepairs in assembly: %i\n" % (totalBps))
+    writer.write("Total number of scaffolds: %i\n" % (totalScafs))
+    writer.write("Longest scaffold: %i\n" % (sortedLens[0]))
+    writer.write("Average scaffold length: %i\n" % (avgLen))
+
+    nVs = calcNstats(sortedLens, totalBps)
+
+    names = ["N10", "N20", "N30", "N40", "N50", "N60", "N70", "N80", "N90", "N100"]
+    for i, nv in enumerate(nVs):
+        writer.write("%s = %i, n = %i\n" % (names[i], nv[0], nv[1]))
+
+    writer.close()
+
+
+# function to remove all tmp files created while running the script
+def removeTMPfiles():
+    subprocess.call("rm %s/tmp*" % (globalVars.outDir), shell=True)
+
+
+###########################################################################
+############################ RUN EVERYTHING ###############################
+def run(genome, nobreak, curIter):
+    # create the output directory if it does not exist already
+    subprocess.call("mkdir -p %s" % (args.outdir), shell=True)
+
+    miniPATH, blPATH = parsePATHs()
+
+    # initialize the global variables; the read hash is carried over between iterations
+    if curIter > 0:
+        tempStore = globalVars.readHash
+    globalVars.init(args.outdir, args.size, args.minlinks, args.minoverlap, args.minident, args.threads, args.minSVLen, blPATH, miniPATH)
+
+    if curIter > 0:
+        globalVars.readHash = tempStore
+        tempStore = ""
+    contigGraph = globalVars.coGRAPH()
+
+    print("0a) Parsing Contigs")
+    parseContigs(genome, contigGraph)
+
+    t0 = datetime.datetime.now()
+    splitFile = args.outdir + "/tmp_%ibp_splits" % (args.size)
+    if curIter == 0:
+        print("\n0b) Splitting reads into %ibp chunks" % (args.size))
+        if args.fasta:
+            splitReads.splitReadsFA(args.fastq, splitFile, args.size)
+        else:
+            splitReads.splitReadsFQ(args.fastq, splitFile, args.size)
+    t1 = datetime.datetime.now()
+    splittime = t1 - t0
+
+    print("\n1) Running minimap2")
+    t0 = datetime.datetime.now()
+    # name for the mapping output file
+    myPAF = globalVars.outDir + "/tmp_it%i.paf" % (curIter)
+    runMinimap2(args.threads, mapMode, genome, splitFile, myPAF)
+    t1 = datetime.datetime.now()
+    maptime = t1 - t0
+
+    print("\n2) Parsing PAF")
+    t0 = datetime.datetime.now()
+    edgeSet = parsePAF(myPAF)
+    t1 = datetime.datetime.now()
+    parseTime = t1 - t0
+
+    #avgCOV = statistics.median(globalVars.largestMapped)
+    #print("\tCalculated average coverage: %i" % (avgCOV))
+
+    print("\n3) Clustering and identification of SVs and misjoins")
+    t0 = datetime.datetime.now()
+    restart = checkContigsNEW(curIter, nobreak)
+    t1 = datetime.datetime.now()
+    svTime = t1 - t0
+    #restart, contStats = checkContigs(curIter, nobreak)
+    if restart == True and nobreak == False:
+        # contigs were readjusted --> rerun the mapping on the adjusted contigs
+        print("\tContigs were adjusted, restart needed!")
+        return False
+    else:
+        # no readjustment was necessary --> continue with GPMA
+        print("\n4) Running GPMA")
+        print("\tNumber of edges: %i" % (len(edgeSet)))
+        t0 = datetime.datetime.now()
+        runGPMA(edgeSet, contigGraph)
+        t1 = datetime.datetime.now()
+        gpmaTime = t1 - t0
+
+        # get the paths between contigs for output
+        print("\n5) Outputting scaffolds and filling gaps")
+        t0 = datetime.datetime.now()
+        scafLengths = getPATH(contigGraph)
+        t1 = datetime.datetime.now()
+        getPathTime = t1 - t0
+
+        haploWriter = open(globalVars.outDir + "/duplicates.fasta", "w")
+        print("\n6) Outputting unscaffolded duplicates")
+        # output haplotigs
+        visited = {}
+        for ele in globalVars.adjustDICT:
+            sID = ele[:-2]
+            if visited.get(sID, "nope") == 1:
+                continue
+
+            contSeq = globalVars.contDICT[sID]
+            haploWriter.write(">%s\n%s\n" % (sID, contSeq))
+
+            visited[sID] = 1
+        haploWriter.close()
+
+        # get stats
+        scafStats(scafLengths)
+
+        # cleaning up:
+        print("\n7) Cleaning up - removing tmp files")
+        removeTMPfiles()
+
+        print("\n\n")
+
+        etime = datetime.datetime.now()
+        totaltime = etime - stime
+        print("Total elapsed time: " + str(totaltime) + " [h:min:sec:millisec]")
+        print("0) Splitting time: " + str(splittime) + " [h:min:sec:millisec]")
+        print("1) Mapping time (minimap2): " + str(maptime) + " [h:min:sec:millisec]")
+        print("2) Parsing time: " + str(parseTime) + " [h:min:sec:millisec]")
+        print("3) SV clustering time: " + str(svTime) + " [h:min:sec:millisec]")
+        print("4) GPMA time: " + str(gpmaTime) + " [h:min:sec:millisec]")
+        #print("\tTotal BLASTs done: %i" % (globalVars.blastCounter))
+        print("5) GetPath time: " + str(getPathTime) + " [h:min:sec:millisec]")
+
+        return True
+
+
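+# NOTE (editor): run() is driven once per iteration by the __main__ block
+# below; an example invocation (paths are placeholders):
+#   ./main.py -ge contigs.fasta -fq reads.fastq -out out/ -m pb -it 2 -s 1000
+
+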
+##########################################################################
+##########################################################################
+##########################################################################
+parser = argparse.ArgumentParser(description="")
+parser.add_argument(
+    "-ge", "--genome", help="Contigs in fasta format", type=str, required=True)
+parser.add_argument(
+    "-fq", "--fastq", help="Reads in fastq format", type=str, required=True)
+parser.add_argument(
+    "-out", "--outdir", help="Directory for output", type=str, required=True)
+parser.add_argument(
+    "-m", "--mode", help="Mapping mode in minimap2 for different types of input data. Options: pb/ont [default=pb]", type=str, default="pb")
+parser.add_argument(
+    "-t", "--threads", help="Threads to use for minimap2 only [default=4]", type=int, default=4)
+parser.add_argument(
+    "-ml", "--minlinks", help="Minimum reads supporting a link needed to set an edge between two contigs [default=2]", type=int, default=2)
+parser.add_argument(
+    "-mo", "--minoverlap", help="Minimum overlap needed for two contigs to be merged [default=50]", type=int, default=50)
+parser.add_argument(
+    "-mi", "--minident", help="Minimum identity in overlap needed for two contigs to be merged [default=85.0]", type=float, default=85.0)
+parser.add_argument(
+    "-it", "--iterations", help="Maximal iterations performed for breaking; set to 0 to perform NO breaking [default=2]", type=int, default=2)
+parser.add_argument(
+    "-msvl", "--minSVLen", help="Minimum length of a structural variant (insertion, deletion, inversion) to break a contig for [default=1000]", type=int, default=1000, choices=range(500, 10001), metavar="[500, 10000]")
+parser.add_argument(
+    "-s", "--size", help="Size to split the input reads into (in bps) [default=1000]", type=int, default=1000)
+parser.add_argument(
+    "-fa", "--fasta", help="Enable if input reads are in fasta format", action="store_true")
+args = parser.parse_args()
+
+
+##########################################################################
+if __name__ == "__main__":
+    # start time
+    stime = datetime.datetime.now()
+    # pick the right minimap2 preset
+    if args.mode == "pb":
+        mapMode = 'map-pb'
+    elif args.mode == "ont":
+        mapMode = 'map-ont'
+    else:
+        # fail early instead of hitting a NameError on mapMode later on
+        parser.error("unknown --mode '%s', options are: pb/ont" % (args.mode))
+
+    genome = args.genome
+    curIt = 0
+    nobreak = False
+    while curIt <= args.iterations:
+        print("---------------------------------------------ITERATION %i---------------------------------------------" % (curIt))
+        if curIt == args.iterations:
+            nobreak = True
+        end = run(genome, nobreak, curIt)
+        genome = args.outdir + "/" + "adjusted_contigs_it%i.fa" % (curIt)
+        curIt += 1
+        if end == True:
+            # run finished without a restart --> skip any remaining iterations
+            curIt = args.iterations + 1
+    print("\n\n")
\ No newline at end of file
diff --git a/merging.py b/merging.py
new file mode 100644
index 0000000..ec236e7
--- /dev/null
+++ b/merging.py
@@ -0,0 +1,472 @@
+import globalVars
+import gpma
+import subprocess, datetime
+try:
+    from subprocess import DEVNULL  # py3k
+except ImportError:
+    import os  # for python2
+    DEVNULL = open(os.devnull, 'wb')
+
+
+# function to reverse complement a sequence
+def reverseComplement(seq):
+    old = "ACGT"
+    replace = "TGCA"
+    return seq.upper().translate(str.maketrans(old, replace))[::-1]
+
+
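+# NOTE (editor): quick illustrative check, not part of the original module:
+#   reverseComplement("acgtt") -> "AACGT"
+# (works on lower case too, since the sequence is upper-cased before the
+# translation table is applied)
+
+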
+# function to run a blastn comparison between two contigs
+def runBLAST(id1, id2, eOL, checkOL):
+    #print("Merging %s and %s with estimated OL:%i" % (id1, id2, eOL))
+    seq1 = globalVars.contDICT[id1[:-2]]
+    seq2 = globalVars.contDICT[id2[:-2]]
+    seq1_trimmed = seq1
+    seq2_trimmed = seq2
+    # if the estimated overlap spans a whole contig, trim the other one to a
+    # window of matching size
+    if eOL * -1 >= len(seq1):
+        if id2.endswith("1"):
+            seq2_trimmed = seq2[len(seq2) - eOL * -1: (len(seq2) - eOL * -1) + len(seq1)]
+        elif id2.endswith("0"):
+            seq2_trimmed = seq2[(eOL * -1) - len(seq1): eOL * -1]
+    if eOL * -1 >= len(seq2):
+        if id1.endswith("1"):
+            # editor's note: the original anchored this slice on len(seq2);
+            # mirroring the branch above, slicing seq1 should be anchored on
+            # len(seq1) - the original looks like a copy-paste slip
+            seq1_trimmed = seq1[len(seq1) - eOL * -1: (len(seq1) - eOL * -1) + len(seq2)]
+        elif id1.endswith("0"):
+            seq1_trimmed = seq1[(eOL * -1) - len(seq2): eOL * -1]
+
+    '''
+    if checkOL==True:
+        #only blast overlaps against each other
+        #if contig longer than overlap
+        if len(seq1) > estOL*(-1)+500:
+            if id1.endswith("_1"):
+                seq1_trimmed=seq1[int(len(seq1)+(estOL-500)):]
+            elif id1.endswith("_0"):
+                seq1_trimmed=seq1[:int(estOL*(-1)+500)]
+            else:
+                print("WTF is this name - %s"%(id1))
+        else:
+            seq1_trimmed=seq1
+            checkOL=False
+
+        if len(seq2) > estOL*(-1)+500:
+            if id2.endswith("_1"):
+                seq2_trimmed=seq2[int(len(seq2)+(estOL-500)):]
+            elif id2.endswith("_0"):
+                seq2_trimmed=seq2[:int(estOL*(-1)+500)]
+            else:
+                print("WTF is this name - %s"%(id2))
+        else:
+            seq2_trimmed=seq2
+            checkOL=False
+    else:
+        seq1_trimmed=seq1
+        seq2_trimmed=seq2
+    '''
+
+    tmpDIR = globalVars.outDir
+
+    oWriter = open(tmpDIR + "/tmp1.fa", "w")
+    oWriter.write(">%s\n%s\n" % (id1, seq1_trimmed))
+    oWriter.close()
+    oWriter = open(tmpDIR + "/tmp2.fa", "w")
+    oWriter.write(">%s\n%s\n" % (id2, seq2_trimmed))
+    oWriter.close()
+
+    # define blastn output columns
+    outfmt = "'6 qseqid sseqid pident length qstart qend sstart send sstrand evalue bitscore'"
+    stime = datetime.datetime.now()
+    # no threads: 'num_threads' is currently ignored when 'subject' is specified
+    subprocess.call("blastn -word_size 15 -outfmt %s -subject %s -query %s -out %s" % (outfmt, tmpDIR + "/tmp1.fa", tmpDIR + "/tmp2.fa", tmpDIR + "/tmp.blast6"),
+                    shell=True, stdout=DEVNULL, stderr=subprocess.STDOUT)
+    etime = datetime.datetime.now()
+    #print("Blast done, time: " + str(etime - stime) + " [h:min:sec:millisec]")
+    # increase the BLAST counter
+    globalVars.blastCounter += 1
+
+    # lengths need to be passed this way around --> 2 first, then 1, because
+    # contig 1 was used as the reference
+    success, rOL, mAdjust = tryMERGE(tmpDIR + "/tmp.blast6", id1, id2, len(seq1_trimmed), len(seq2_trimmed))
+
+    #if (rOL*(-1) == len(seq1_trimmed)-10 or rOL*(-1) == len(seq2_trimmed)-10) and checkOL == True:
+    #    print("Full overlap, rerun with more sequence!")
+    #    return runBLAST(id1, id2, eOL, False)
+
+    return success, rOL, mAdjust
+
+
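+# NOTE (editor): the custom outfmt 6 string used above yields these
+# tab-separated columns, which tryMERGE() below indexes positionally
+# (illustrative summary; the order must match the 'outfmt' string):
+#   0 qseqid  1 sseqid  2 pident  3 length  4 qstart  5 qend
+#   6 sstart  7 send    8 sstrand  9 evalue  10 bitscore
+
+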
+# function for merging contigs by reading the blastn output
+def tryMERGE(blastPATH, cID1, cID2, len1, len2):
+    blastReader = open(blastPATH)
+
+    mergeAdjust = ""
+    # just an initialization because it needs a return value
+    alnLen = 1
+
+    alllines = blastReader.readlines()
+    if len(alllines) == 0:
+        # no hit found --> no merging can be done
+        merged = False
+    else:
+        for line in alllines:
+            splitted = line.strip().split("\t")
+            # s2 first because s1 was used as the reference
+            s2 = splitted[0]
+            s1 = splitted[1]
+            ident = float(splitted[2])
+            alnLen = int(splitted[3])
+            # IMPORTANT: BLAST indices start with 1, not 0 --> -1 for all of them
+            s2_start = int(splitted[4]) - 1
+            s2_end = int(splitted[5]) - 1
+            s1_start = int(splitted[6]) - 1
+            s1_end = int(splitted[7]) - 1
+            s1_strand = splitted[8]
+
+            #print("\tc1:%s len=%i ostart=%i oend=%i" % (cID1, len1, s1_start, s1_end))
+            #print("\tc2:%s len=%i ostart=%i oend=%i" % (cID2, len2, s2_start, s2_end))
+
+            # the thresholds come from the -mo/-mi input arguments
+            if alnLen >= globalVars.minOverlap and ident >= globalVars.minIdent:
+
+                # first check if alnLen covers one contig (minus 10 bp slack) entirely,
+                # which means it is fully part of the other one and can be removed
+                if alnLen >= len1 - 10:
+                    #print("Contig %s is fully part of contig %s and will be removed!" % (cID1, cID2))
+                    if s1_start < s1_end:
+                        if s2_start < s2_end:
+                            nTar0 = cID2[:-2] + "_0"
+                            aDist0 = s2_start
+                            nTar1 = cID2[:-2] + "_1"
+                            aDist1 = len2 - s2_end
+                        elif s2_start > s2_end:
+                            nTar0 = cID2[:-2] + "_1"
+                            aDist0 = len2 - s2_end
+                            nTar1 = cID2[:-2] + "_0"
+                            aDist1 = s2_start
+                    elif s1_start > s1_end:
+                        if s2_start < s2_end:
+                            nTar0 = cID2[:-2] + "_1"
+                            aDist0 = len2 - s2_end
+                            nTar1 = cID2[:-2] + "_0"
+                            aDist1 = s2_start
+                        elif s2_start > s2_end:
+                            nTar0 = cID2[:-2] + "_0"
+                            aDist0 = s2_start
+                            nTar1 = cID2[:-2] + "_1"
+                            aDist1 = len2 - s2_end
+
+                    #removeMerged(globalVars.contigGR, cID1[:-2]+"_0", nTar0, aDist0)
+                    #removeMerged(globalVars.contigGR, cID1[:-2]+"_1", nTar1, aDist1)
+                    mergeAdjust = (cID1, nTar0, aDist0, nTar1, aDist1)
+                    merged = "Removed"
+                    break
+
+                elif alnLen >= len2 - 10:
+                    #print("Contig %s is fully part of contig %s and will be removed!" % (cID2, cID1))
+                    if s1_start < s1_end:
+                        if s2_start < s2_end:
+                            nTar0 = cID1[:-2] + "_0"
+                            aDist0 = s1_start
+                            nTar1 = cID1[:-2] + "_1"
+                            aDist1 = len1 - s1_end
+                        elif s2_start > s2_end:
+                            nTar0 = cID1[:-2] + "_1"
+                            aDist0 = len1 - s1_end
+                            nTar1 = cID1[:-2] + "_0"
+                            aDist1 = s1_start
+                    elif s1_start > s1_end:
+                        if s2_start < s2_end:
+                            nTar0 = cID1[:-2] + "_1"
+                            aDist0 = len1 - s1_end
+                            nTar1 = cID1[:-2] + "_0"
+                            aDist1 = s1_start
+                        elif s2_start > s2_end:
+                            nTar0 = cID1[:-2] + "_0"
+                            aDist0 = s1_start
+                            nTar1 = cID1[:-2] + "_1"
+                            aDist1 = len1 - s1_end
+
+                    #removeMerged(globalVars.contigGR, cID2[:-2]+"_0", nTar0, aDist0)
+                    #removeMerged(globalVars.contigGR, cID2[:-2]+"_1", nTar1, aDist1)
+                    mergeAdjust = (cID2, nTar0, aDist0, nTar1, aDist1)
+                    merged = "Removed"
+                    break
+
+                # get the orientations of the contigs, i.e. which ends are overlapping
+                ori1 = cID1[-1]
+                ori2 = cID2[-1]
+
+                # case1 c1_1 --> c2_0
+                if ori1 == "1" and ori2 == "0":
+                    # example: c1:ctg7180000370328_1 len=1919 ostart=1418 oend=1918
+                    # example: c2:ctg7180000371159_0 len=5383 ostart=0 oend=500
+                    # make sure the alignments are at the borders
+                    if s1_end + 10 >= len1 - 1 and s2_start - 10 <= 0:
+                        newSeqID = cID1 + "__" + cID2
+                        saveMerge(newSeqID, cID1, cID2, alnLen)
+                        merged = True
+                    else:
+                        # alignments are not at the borders
+                        merged = False
+
+                # case2 c1_1 --> c2_1
+                elif ori1 == "1" and ori2 == "1":
+                    # example: c1:ctg7180000370630_1 len=5524 ostart=5523 oend=5458
+                    # example: c2:jtg7180000371240f_7180000371241f_1 len=12170 ostart=12104 oend=12169
+                    if s1_start + 10 >= len1 - 1 and s2_end + 10 >= len2 - 1:
+                        newSeqID = cID1 + "__" + cID2
+                        saveMerge(newSeqID, cID1, cID2, alnLen)
+                        merged = True
+                    else:
+                        merged = False
+
+                # case3 c1_0 --> c2_1
+                elif ori1 == "0" and ori2 == "1":
+                    # example: c1:ctg7180000370268_0 len=3867 ostart=0 oend=419
+                    # example: c2:ctg7180000370715_1 len=8150 ostart=7730 oend=8149
+                    if s2_end + 10 >= len2 - 1 and s1_start - 10 <= 0:
+                        newSeqID = cID2 + "__" + cID1
+                        saveMerge(newSeqID, cID1, cID2, alnLen)
+                        merged = True
+                    else:
+                        merged = False
+
+                # case4 c1_0 --> c2_0
+                elif ori1 == "0" and ori2 == "0":
+                    # example: c1:ctg7180000370227_0 len=1568 ostart=415 oend=0
+                    # example: c2:ctg7180000371034_0 len=1684 ostart=0 oend=415
+                    if s1_end - 10 <= 0 and s2_start - 10 <= 0:
+                        newSeqID = cID2 + "__" + cID1
+                        saveMerge(newSeqID, cID1, cID2, alnLen)
+                        merged = True
+                    else:
+                        merged = False
+
+                else:
+                    # defensive default: orientation suffixes other than 0/1 are unexpected
+                    merged = False
+
+                if merged == True:
+                    # merging was successful --> no reason to check further lines,
+                    # break out of the loop and ignore the remaining alignments
+                    break
+            else:
+                # merging failed: either too little overlap or the identity is too low
+                merged = False
+
+    blastReader.close()
+    return merged, alnLen * -1, mergeAdjust
+
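+
+# NOTE (editor): summary of the four border cases handled in tryMERGE() above,
+# illustrative reading only (suffix _0 = contig start, _1 = contig end):
+#   c1_1 --> c2_0 : end of c1 overlaps start of c2   --> merge as c1__c2
+#   c1_1 --> c2_1 : end of c1 overlaps end of c2     --> merge as c1__c2
+#   c1_0 --> c2_1 : start of c1 overlaps end of c2   --> merge as c2__c1
+#   c1_0 --> c2_0 : start of c1 overlaps start of c2 --> merge as c2__c1
+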
+
+def saveMerge(nID, oID1, oID2, olap):
+    # canonical key: the lexicographically smaller ID always comes first
+    if oID1 <= oID2:
+        combi = oID1 + "__" + oID2
+    else:
+        combi = oID2 + "__" + oID1
+
+    globalVars.mergedDICT[combi] = (nID, olap)
+
+
+# function to properly remove a node key from the contig graph dictionary
+def removeMerged(rdict, rkey, newTar, addDist):
+    print("Trying to remove %s from dict" % (rkey))
+    try:
+        print("Removing %s from dict" % (rkey))
+        # part to adjust an existing edge
+        existingEdge = rdict[rkey].active
+        globalVars.adjustDICT[rkey] = (newTar, addDist)
+        if existingEdge != "":
+            existingTarget = existingEdge.target
+            rdict[existingTarget].active = ""
+            #print("Update edge with newTarget %s and added dist +%i" % (newTar, addDist))
+            adjustedEdge = existingEdge
+            adjustedEdge.target = newTar
+            adjustedEdge.dist -= addDist  # must be subtracted to properly adjust
+            #gpma.checkEdgeFit(adjustedEdge, rdict)
+            #gpma.gpma()
+        # remove the contig from the hash
+        del rdict[rkey]
+
+    except KeyError:
+        print("KeyError while removing")
+        pass
+
+
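+# NOTE (editor): illustrative only - saveMerge() above canonicalizes the pair
+# key, so both ("ctgB_1", "ctgA_0") and ("ctgA_0", "ctgB_1") land under the
+# key "ctgA_0__ctgB_1" in globalVars.mergedDICT; a pair is recorded only once.
+
+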
+def fillGAPS(cID1, cID2, cLen1, cLen2, cSeq1, cSeq2, rID, rLen, rSeq):
+    tmpDIR = globalVars.outDir
+    oWriter = open(tmpDIR + "/tmp_cont1.fa", "w")
+    oWriter.write(">%s\n%s\n" % (cID1, cSeq1))
+    oWriter.close()
+    oWriter = open(tmpDIR + "/tmp_cont2.fa", "w")
+    oWriter.write(">%s\n%s\n" % (cID2, cSeq2))
+    oWriter.close()
+    oWriter = open(tmpDIR + "/tmp_read.fa", "w")
+    oWriter.write(">%s\n%s\n" % (rID, rSeq))
+    oWriter.close()
+
+    # define blastn output columns
+    outfmt = "'6 qseqid sseqid pident length qstart qend sstart send sstrand evalue bitscore'"
+    # no threads: 'num_threads' is currently ignored when 'subject' is specified
+    subprocess.call("blastn -word_size 15 -outfmt %s -subject %s -query %s -out %s" % (outfmt, tmpDIR + "/tmp_cont1.fa", tmpDIR + "/tmp_read.fa", tmpDIR + "/tmp1.blast6"),
+                    shell=True, stdout=DEVNULL, stderr=subprocess.STDOUT)
+    subprocess.call("blastn -word_size 15 -outfmt %s -subject %s -query %s -out %s" % (outfmt, tmpDIR + "/tmp_cont2.fa", tmpDIR + "/tmp_read.fa", tmpDIR + "/tmp2.blast6"),
+                    shell=True, stdout=DEVNULL, stderr=subprocess.STDOUT)
+
+    ori1 = cID1[-1]
+    ori2 = cID2[-1]
+
+    #print("TRYING TO MERGE C1:%s AND READ:%s..." % (cID1, rID))
+    blastReader = open(tmpDIR + "/tmp1.blast6")
+    alllines = blastReader.readlines()
+    merge1 = False
+    if len(alllines) == 0:
+        # no hit found --> no C1 merging can be done
+        merge1 = False
+    else:
+        for line in alllines:
+            merge1 = False
+            splitted = line.strip().split("\t")
+            rID = splitted[0]
+            cID = splitted[1]
+            ident = float(splitted[2])
+            alnLen = int(splitted[3])
+            # IMPORTANT: BLAST indices start with 1, not 0 --> -1 for all of them
+            r_start = int(splitted[4]) - 1
+            r_end = int(splitted[5]) - 1
+            c_start = int(splitted[6]) - 1
+            c_end = int(splitted[7]) - 1
+            c_strand1 = splitted[8]
+
+            if alnLen >= 0.3 * globalVars.splitLen and ident >= 80:
+                end = max(c_start, c_end)
+                if end + 20 >= cLen1:
+                    # good C1 merge: the alignment reaches the contig border
+                    merge1 = True
+                    # get the read start position for the concatenation
+                    if c_strand1 == "minus":
+                        readStart = r_start
+                    else:
+                        readStart = r_end
+                    break
+            # otherwise: alignment too weak or not at the contig border --> try the next hit
+
+    blastReader.close()
+
+    # no need to try merging C2 with the read if the C1 merge failed
+    if merge1 == False:
+        return False
+
+    #print("TRYING TO MERGE C2:%s AND READ:%s..." % (cID2, rID))
+    blastReader = open(tmpDIR + "/tmp2.blast6")
+    alllines = blastReader.readlines()
+    merge2 = False
+    if len(alllines) == 0:
+        # no hit found --> no C2 merging can be done
+        merge2 = False
+    else:
+        for line in alllines:
+            merge2 = False
+            splitted = line.strip().split("\t")
+            # read first because the contig was used as the reference
+            rID = splitted[0]
+            cID = splitted[1]
+            ident = float(splitted[2])
+            alnLen = int(splitted[3])
+            # IMPORTANT: BLAST indices start with 1, not 0 --> -1 for all of them
+            r_start = int(splitted[4]) - 1
+            r_end = int(splitted[5]) - 1
+            c_start = int(splitted[6]) - 1
+            c_end = int(splitted[7]) - 1
+            c_strand2 = splitted[8]
+
+            if alnLen >= 0.3 * globalVars.splitLen and ident >= 80:
+                start = min(c_start, c_end)
+                if start - 20 <= 0:
+                    # good C2 merge: the alignment reaches the contig border
+                    merge2 = True
+                    # get the read end position for the concatenation
+                    if c_strand2 == "minus":
+                        readEnd = r_end
+                    else:
+                        readEnd = r_start
+                    break
+
+    blastReader.close()
+
+    if merge1 == True and merge2 == True:
+        if c_strand1 != c_strand2:  # bug fix: the original compared c_strand1 to itself
+            # hits were in different directions --> gap filling failed
+            return False
+        if readStart <= readEnd:
+            return rSeq[readStart:readEnd]
+        else:
+            return reverseComplement(rSeq[readEnd:readStart])
+    else:
+        return False
diff --git a/minimap2 b/minimap2
new file mode 160000
index 0000000..d90583b
--- /dev/null
+++ b/minimap2
@@ -0,0 +1 @@
+Subproject commit d90583b83cd81abd652abb025f39b8884937d59c
diff --git a/splitReads.py b/splitReads.py
new file mode 100644
index 0000000..fe57fa2
--- /dev/null
+++ b/splitReads.py
@@ -0,0 +1,131 @@
+import globalVars
+
+
+class myFQ:
+
+    def __init__(self, iD, seq, opt, qual):
+        self.iD = iD
+        self.seq = seq
+        self.opt = opt
+        self.qual = qual
+
+
+class myFA:
+
+    def __init__(self, iD, seq):
+        self.iD = iD
+        self.seq = seq
+
+
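+# NOTE (editor): illustrative only - both splitters below chop a read into
+# fixed-size chunks plus a shorter remainder; e.g. a 2300 bp read with
+# splitSize=1000 becomes three records named "<n>_0" (1000 bp), "<n>_1"
+# (1000 bp) and "<n>_2" (300 bp), where <n> is the running read number that
+# is also used as the key in globalVars.readHash.
+
+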
+def splitReadsFQ(inFQ, oFQ, splitSize):
+    if inFQ.endswith(".gz"):
+        import gzip
+        fqParser = gzip.open(inFQ, mode='rt')
+    else:
+        fqParser = open(inFQ)
+
+    myWriter = open(oFQ, "w")
+
+    lineCount = 0
+    seqCount = 0
+    writtenREAD_ID = 1
+    for line in fqParser:
+        lineCount += 1
+
+        if lineCount == 1:
+            rid = line[1:].rstrip().split("\t")[0].split(" ")[0]
+            curRead = myFQ(line.strip(), "", "", "")
+        elif lineCount == 2:
+            curRead.seq = line.strip()
+        elif lineCount == 3:
+            curRead.opt = line.strip()
+        elif lineCount == 4:
+            curRead.qual = line.strip()
+
+            curStart = 0
+            curEnd = splitSize
+            splitCount = 0
+
+            # write all full-size chunks ...
+            while curEnd < len(curRead.seq):
+                splitID = str(writtenREAD_ID) + "_" + str(splitCount)
+                #splitID = str(rid) + "_" + str(splitCount)
+                myWriter.write("@" + splitID + "\n")
+                myWriter.write(str(curRead.seq[curStart:curEnd]) + "\n")
+                myWriter.write(str(curRead.opt) + "\n")
+                myWriter.write(str(curRead.qual[curStart:curEnd]) + "\n")
+
+                splitCount += 1
+                curStart += splitSize
+                curEnd += splitSize
+
+            # ... and the remainder (the last, possibly shorter, chunk)
+            splitID = str(writtenREAD_ID) + "_" + str(splitCount)
+            #splitID = str(rid) + "_" + str(splitCount)
+            myWriter.write("@" + splitID + "\n")
+            myWriter.write(str(curRead.seq[curStart:]) + "\n")
+            myWriter.write(str(curRead.opt) + "\n")
+            myWriter.write(str(curRead.qual[curStart:]) + "\n")
+
+            globalVars.readHash[str(writtenREAD_ID)] = curRead.seq
+            #globalVars.readHash[str(rid)] = curRead.seq
+
+            writtenREAD_ID += 1
+            seqCount += 1
+            lineCount = 0
+
+    myWriter.close()
+    fqParser.close()
+
+
+def splitReadsFA(inFA, oFA, splitSize):
+    if inFA.endswith(".gz"):
+        import gzip
+        faParser = gzip.open(inFA, mode='rt')  # bug fix: text mode, as in splitReadsFQ
+    else:
+        faParser = open(inFA)
+
+    myWriter = open(oFA, "w")
+
+    def writeRecord(read, readNumber):
+        # write all chunks of one record, including the shorter remainder
+        # (bug fix: the original never wrote the remainder chunk)
+        curStart = 0
+        splitCount = 0
+        while curStart < len(read.seq):
+            splitID = str(readNumber) + "_" + str(splitCount)
+            curSeq = str(read.seq[curStart:curStart + splitSize])
+            myWriter.write(">" + splitID + "\n")
+            # fasta format capped at 80 bps per line
+            myIndex = 0
+            while (myIndex + 80) <= len(curSeq):
+                myWriter.write("%s\n" % (curSeq[myIndex:myIndex + 80]))
+                myIndex += 80
+            if myIndex < len(curSeq):
+                myWriter.write("%s\n" % (curSeq[myIndex:]))
+            splitCount += 1
+            curStart += splitSize
+        globalVars.readHash[str(readNumber)] = read.seq
+
+    curID = ""
+    curRead = None
+    seqCount = 0
+    writtenREAD_ID = 1
+    for line in faParser:
+        if line.startswith(">"):
+            if curID != "":
+                writeRecord(curRead, writtenREAD_ID)
+                writtenREAD_ID += 1
+                seqCount += 1
+            curID = line.strip()
+            curRead = myFA(curID, "")
+        else:
+            curRead.seq += line.strip()
+
+    # flush the last record (bug fix: the original only wrote a record once the
+    # NEXT header line was seen and therefore dropped the final read)
+    if curRead is not None and curRead.seq != "":
+        writeRecord(curRead, writtenREAD_ID)
+        seqCount += 1
+
+    myWriter.close()
+    faParser.close()
-- 
GitLab