diff --git a/checkHitlist.py b/checkHitlist.py index 25221eeb10cb5ce2649c66c7fa6d5304be69a448..16683cf9388f687a7b9ceca17ee7d755eb9433c9 100644 --- a/checkHitlist.py +++ b/checkHitlist.py @@ -8,7 +8,7 @@ def checkINSDELS(hit1, hit2): #print("check INS/DELS") hdist = ((hit2.split - hit1.split) - 1) * hit1.q_len - if hit1.strand == "+" and hit2.strand == "+": + if hit1.strand == "+" and hit2.strand == "+": #both map on fwd strand h1_end = min(hit1.t_end + (hit1.q_len - hit1.q_end), hit1.t_len-1) h2_start = max(hit2.t_start - hit2.q_start, 0) dist = h2_start - h1_end @@ -18,7 +18,7 @@ def checkINSDELS(hit1, hit2): cstart=hit1.t_start cend=hit2.t_end - elif hit1.strand == "-" and hit2.strand == "-": + elif hit1.strand == "-" and hit2.strand == "-": #both map on rev strand h1_end = min(hit1.t_start - (hit1.q_len - hit1.q_end), hit1.t_len-1) h2_start = max(hit2.t_end + hit2.q_start, 0) dist = h1_end - h2_start @@ -27,8 +27,8 @@ def checkINSDELS(hit1, hit2): eStart=hit2.t_end-1 cstart=hit1.t_end cend=hit2.t_start - - if diffDIST >= globalVars.minSVL or diffDIST <= -1*globalVars.minSVL: + #if dist error larger(pos or neg) than min SV size + if diffDIST >= globalVars.minSVL or diffDIST <= -1*globalVars.minSVL: #print("Error detected") #print(hit1) #print(hit2) @@ -47,7 +47,7 @@ def checkINSDELS(hit1, hit2): return curError, curCorrect -# identify inversions +# fucntion to identify inversions e.g. compare hits in different orientations on same contig def checkINVS(hit1, hit2, oriHASH): #print("check INVS") # to store the hits/fragments of inversion --> start and end @@ -63,6 +63,8 @@ def checkINVS(hit1, hit2, oriHASH): return curError +# function to calculate the distance between to contigs +# if both splits map on rev strand def calcHitContig_Dist_rev(hit1, hit2): err1=False err2=False @@ -85,17 +87,7 @@ def calcHitContig_Dist_rev(hit1, hit2): #print(h2_start, hdist) #print(hit2) err2=True - ''' - if cdist<-1*globalVars.splitLen: - if hit1.t_start > globalVars.splitLen: - print("add error %s for cont1"%(hit1.target)) - print(hit1) - err1=True - if globalVars.contLenDICT[hit2.target]-hit2.t_end > globalVars.splitLen: - print("add error %s for cont2"%(hit2.target)) - print(hit2) - err2=True - ''' + #print("RevRev") #print(err1, err2) #print(hit1) @@ -105,7 +97,8 @@ def calcHitContig_Dist_rev(hit1, hit2): #print("\t%i\n"%(cdist)) return cdist, err1, err2 - +# function to calculate the distance between to contigs +# if both splits map on fwd strand def calcHitContig_Dist_pos(hit1, hit2): err1=False err2=False @@ -138,7 +131,8 @@ def calcHitContig_Dist_pos(hit1, hit2): #print("\t%i\n"%(cdist)) return cdist, err1, err2 - +# function to calculate the distance between to contigs +# if hit1 on fwd strand and hit2 on rev strand def calcHitContig_Distance_posrev(hit1, hit2): err1=False err2=False @@ -161,17 +155,7 @@ def calcHitContig_Distance_posrev(hit1, hit2): #print(h2_start, hdist) #print(hit2) err2=True - ''' - if cdist<-1*globalVars.splitLen: - if globalVars.contLenDICT[hit1.target]-hit1.t_end > globalVars.splitLen: - print("add error %s for cont1"%(hit1.target)) - print(hit1) - err1=True - if globalVars.contLenDICT[hit2.target]-hit2.t_end > globalVars.splitLen: - print("add error %s for cont2"%(hit2.target)) - print(hit2) - err2=True - ''' + #print("PosRev") #print(err1, err2) #print(hit1) @@ -181,7 +165,8 @@ def calcHitContig_Distance_posrev(hit1, hit2): #print("\t%i\n"%(cdist)) return cdist, err1, err2 - +# function to calculate the distance between to contigs +# if hit1 on rev strand and hit2 on fwd strand def calcHitContig_Distance_revpos(hit1, hit2): err1=False err2=False @@ -204,17 +189,6 @@ def calcHitContig_Distance_revpos(hit1, hit2): #print(h2_start, hdist) #print(hit2) err2=True - ''' - if cdist<-1*globalVars.splitLen: - if hit1.t_start > globalVars.splitLen: - print("add error %s for cont1"%(hit1.target)) - print(hit1) - err1=True - if hit2.t_start > globalVars.splitLen: - print("add error %s for cont2"%(hit2.target)) - print(hit2) - err2=True - ''' #print("RevPos") #print(err1, err2) @@ -331,13 +305,7 @@ def checkHitlist(hitlist, edgeGRAPH, oriHASH): # compare 2 hits following each other for curHits in hitlist: - #if curHit.target==globalVars.largestContig[0]: - #print("hit on largest contig found --> add coverage") - # addLongestCOV(curHit) - #if len(curHits)>1: - # for curHit in curHits: - # globalVars.goodE[curHit.target].append((curHit.target, min(curHit.t_start, curHit.t_end), max(curHit.t_start, curHit.t_end))) - + #if len(curHits) > 1 the hit maps repetitive ignore it if len(curHits)==1: curHit=curHits[0] #globalVars.goodE[curHit.target].append((curHit.target, min(curHit.t_start, curHit.t_end), max(curHit.t_start, curHit.t_end))) diff --git a/gpma.py b/gpma.py index e964d82282198b72ae3935e01d7c2af0e1a8ba76..8af0877e00b194f42108b738b9f5ba4681f4774b 100644 --- a/gpma.py +++ b/gpma.py @@ -11,8 +11,6 @@ import copy # function to set edges in graph def setEdges(setEdge, agraph): - - if setEdge.direct!="": newEdgeStart = globalVars.myEdge(setEdge.start, setEdge.target, setEdge.weight, setEdge.dist, setEdge.stdev, setEdge.direct) @@ -39,6 +37,7 @@ def setEdges(setEdge, agraph): #print(oldEdge[1]) return oldEdge +# function to reset an old edge def resetEdge(setEdge, agraph): #print("resetting edge") #print(setEdge) @@ -85,6 +84,8 @@ def removeMergedNEW(rkey, newTar, addDist): #print("Now pointing to %s with dist %i"%(newTar, addDist)) globalVars.adjustDICT[rkey]=(newTar, addDist) +# function to check for check for merging and calls the merging functions +# from merging.py def checkMerge(curEdge, cHASH): #print(curEdge.dist, globalVars.minOverlap*-1) #print(curEdge.dist <= globalVars.minOverlap*-1) @@ -107,6 +108,7 @@ def checkMerge(curEdge, cHASH): return merged, mAdjust +# function to merge paths as defined in the GPMA paper def mergeNEW(nEdge, inGraph): graph=inGraph.cGraph sucess=True @@ -304,7 +306,9 @@ def getNodePath(ggraph, anode): return ePath, pHash - +# given list of edges this function returns the happy and +# unhappiness for all edges with both nodes in the path +# in regard to the currently selected path def checkHappiness(pdict, cDICT): nH=0 nUH=0 @@ -349,8 +353,7 @@ def checkHappiness(pdict, cDICT): else: #print("Incorrect orientation increasing UNhappy by %i"%(curEdge[0])) nUH+=curEdge[0] - continue - + continue return nH, nUH @@ -433,7 +436,7 @@ def gpma(newEdge, inGraph): #check happiness if newHP - (nHP1+nHP2) >= newUHP - (nUHP1+nUHP2): #print("GPMA VERY HAPPY") - for remove in removed: + for remove in removed: removeMergedNEW(remove[0][:-2]+"_0", remove[1], remove[2]) removeMergedNEW(remove[0][:-2]+"_1", remove[3], remove[4]) else: @@ -444,11 +447,8 @@ def gpma(newEdge, inGraph): #print("Happiness p1: %i/%i(H/UH)"%(nHP1, nUHP1)) #[print(e) for e in ePath2] #print("Happiness p2: %i/%i(H/UH)"%(nHP2, nUHP2)) - for edge in oldEdges: + for edge in oldEdges: # if GPMA unhappy reset old graph resetEdge(edge, inGraph) - - - #else: # print("GPMA FAILED!!!") diff --git a/merging.py b/merging.py index ec236e706e8ba77924c317745bcd5863be570cac..1282a43e2dc15f4e18fedd14f48af4f0fbbbe894 100644 --- a/merging.py +++ b/merging.py @@ -36,38 +36,6 @@ def runBLAST(id1, id2, eOL, checkOL): elif id1.endswith("0"): seq1_trimmed=seq1[(eOL*-1)-len(seq2) : eOL*-1] - - - ''' - if checkOL==True: - #only blast overlaps against each other - #if contig longer than overlap - if len(seq1) > estOL*(-1)+500: - if id1.endswith("_1"): - seq1_trimmed=seq1[int(len(seq1)+(estOL-500)):] - elif id1.endswith("_0"): - seq1_trimmed=seq1[:int(estOL*(-1)+500)] - else: - print("WTF is this name - %s"%(id1)) - else: - seq1_trimmed=seq1 - checkOL=False - - if len(seq2) > estOL*(-1)+500: - if id2.endswith("_1"): - seq2_trimmed=seq2[int(len(seq2)+(estOL-500)):] - elif id2.endswith("_0"): - seq2_trimmed=seq2[:int(estOL*(-1)+500)] - else: - print("WTF is this name - %s"%(id2)) - else: - seq2_trimmed=seq2 - checkOL=False - else: - seq1_trimmed=seq1 - seq2_trimmed=seq2 - ''' - tmpDIR = globalVars.outDir #print(len(seq1), len(seq1_trimmed)) @@ -281,7 +249,7 @@ def tryMERGE(blastPATH, cID1, cID2, len1, len2): blastReader.close() return merged, alnLen*-1, mergeAdjust - +# function to save the coordinates of the exact overlap a contig def saveMerge(nID, oID1, oID2, olap): if oID1<=oID2: combi=oID1+"__"+oID2 diff --git a/splitReads.py b/splitReads.py index fe57fa2694e49605a0406ef791c4cae459391125..15c555a0a13f4362571ddb7b16e8f03ec6f18d42 100644 --- a/splitReads.py +++ b/splitReads.py @@ -1,20 +1,22 @@ import globalVars -class myFQ: +# small class defining the content of fastq file +class myFQ: def __init__(self, iD, seq, opt, qual): self.iD = iD self.seq = seq self.opt = opt self.qual = qual - +# small class defining the content of fasta file class myFA: - def __init__(self, iD, seq): self.iD = iD self.seq = seq + +#function to split reads in fastq format def splitReadsFQ(inFQ, oFQ, splitSize): if inFQ.endswith(".gz"): @@ -75,7 +77,7 @@ def splitReadsFQ(inFQ, oFQ, splitSize): myWriter.close() fqParser.close() - +#function to split reads in fasta format def splitReadsFA(inFA, oFA, splitSize): if inFA.endswith(".gz"):