import sys import pandas as pd FatherID=sys.argv[1] MotherID=sys.argv[2] Offspring=sys.argv[3] vcf_file = sys.argv[4] print ("##################################################################################") print ("Trio:", FatherID, MotherID,Offspring) # load the vcf file header_vcf = [] records = [] with open(vcf_file, 'r') as vcf_: for line in vcf_: if line.startswith('##'): header_vcf.append(line) continue if line.startswith('#C'): keys = line.split('\t') keys[0] = keys[0][1:] keys[-1] = keys[-1][:-1] records.append(line[:-1].split('\t')) headervcf = "".join(header_vcf) df = pd.DataFrame(records, columns=keys).iloc[1:,:] # select only the columns with father id, mother id and offspring id data = df[["CHROM","POS",FatherID, MotherID, Offspring]].copy() # keep only the first 3 characters of the genotype data[FatherID] = data[FatherID].apply(lambda x: x[:3]) data[MotherID] = data[MotherID].apply(lambda x: x[:3]) data[Offspring] = data[Offspring].apply(lambda x: x[:3]) # function to classify the genotype of the offspring def gen(father,mother,offspring): if father=="./." or mother =="./." or offspring =="./.": return "Missing data" #any ./. elif (offspring[0]==father[0] and offspring[-1]==mother[0]) or \ (offspring[0]==father[0] and offspring[-1]==mother[-1]) or \ (offspring[0]==father[-1] and offspring[-1]==mother[0]) or \ (offspring[0]==father[-1] and offspring[-1]==mother[-1]) or \ (offspring[0]==mother[0] and offspring[-1]==father[0]) or \ (offspring[0]==mother[0] and offspring[-1]==father[-1]) or \ (offspring[0]==mother[-1] and offspring[-1]==father[0]) or \ (offspring[0]==mother[-1] and offspring[-1]==father[-1]): return "CORRECT" else: # offspring not a combination of parent's genotypes return "INCORRECT" # create a new column with the classification of the genotype of the offspring classification = [gen(data.loc[i,FatherID],data.loc[i,MotherID],data.loc[i,Offspring]) for i in range(1,len(data)+1)] df["classification"] = classification trio = df[["CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT",FatherID, MotherID, Offspring, "classification"]] print(pd.DataFrame(trio.classification.value_counts())) print("----------------------------------------------------------------------------------") print("--------------------Correct/incorrect rate (except missing)-----------------------") df1 = pd.DataFrame(trio[trio.classification!="Missing data"].classification.value_counts(normalize=True)) df2 = pd.DataFrame(trio[trio.classification!="Missing data"].classification.value_counts()) concat = pd.concat([df1, df2], axis=1, join='inner') concat.columns=["GA rate", "GA absolute"] print(concat) # create filtered VCF select = trio[trio.classification=="CORRECT"].drop(columns="classification").rename(columns={"CHROM":"#CHROM"}) with open (f"{FatherID}_{MotherID}_{Offspring}.vcf", "a") as f: f.write(headervcf) select.to_csv(f, sep='\t', index=False)