GenotypeAccuracyEstimation.py

import sys
import pandas as pd


FatherID=sys.argv[1]
MotherID=sys.argv[2]
Offspring=sys.argv[3]
vcf_file = sys.argv[4]

print ("##################################################################################")
print ("Trio:", FatherID, MotherID,Offspring)

# load the vcf file
header_vcf = []
records = []
with open(vcf_file, 'r') as vcf_:
    for line in vcf_:
        if line.startswith('##'):
            header_vcf.append(line)
            continue
        if line.startswith('#C'):
            keys = line.split('\t')
            keys[0] = keys[0][1:]
            keys[-1] = keys[-1][:-1]
        records.append(line[:-1].split('\t'))

headervcf = "".join(header_vcf)

df = pd.DataFrame(records, columns=keys).iloc[1:,:]

# select only the columns with father id, mother id and offspring id
data = df[["CHROM","POS",FatherID, MotherID, Offspring]].copy()

# keep only the first 3 characters of the genotype
data[FatherID] = data[FatherID].apply(lambda x: x[:3])
data[MotherID] = data[MotherID].apply(lambda x: x[:3])
data[Offspring] = data[Offspring].apply(lambda x: x[:3])


# function to classify the genotype of the offspring
def gen(father,mother,offspring):
    if father=="./." or mother =="./." or offspring =="./.":
        return "Missing data" #any ./.
    elif (offspring[0]==father[0] and offspring[-1]==mother[0]) or \
        (offspring[0]==father[0] and offspring[-1]==mother[-1]) or \
        (offspring[0]==father[-1] and offspring[-1]==mother[0]) or \
        (offspring[0]==father[-1] and offspring[-1]==mother[-1]) or \
        (offspring[0]==mother[0] and offspring[-1]==father[0]) or \
        (offspring[0]==mother[0] and offspring[-1]==father[-1]) or \
        (offspring[0]==mother[-1] and offspring[-1]==father[0]) or \
        (offspring[0]==mother[-1] and offspring[-1]==father[-1]):
        return "CORRECT"
    else: # offspring not a combination of parent's genotypes
        return  "INCORRECT"

# create a new column with the classification of the genotype of the offspring
classification = [gen(data.loc[i,FatherID],data.loc[i,MotherID],data.loc[i,Offspring]) for i in range(1,len(data)+1)]
df["classification"] = classification
trio = df[["CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT",FatherID, MotherID, Offspring, "classification"]]
print(pd.DataFrame(trio.classification.value_counts()))

print("----------------------------------------------------------------------------------")
print("--------------------Correct/incorrect rate (except missing)-----------------------")


df1 = pd.DataFrame(trio[trio.classification!="Missing data"].classification.value_counts(normalize=True))
df2 = pd.DataFrame(trio[trio.classification!="Missing data"].classification.value_counts())
concat = pd.concat([df1, df2], axis=1, join='inner')
concat.columns=["GA rate", "GA absolute"]
print(concat)

# create filtered VCF
select = trio[trio.classification=="CORRECT"].drop(columns="classification").rename(columns={"CHROM":"#CHROM"})

with open (f"{FatherID}_{MotherID}_{Offspring}.vcf", "a") as f:
    f.write(headervcf)
    select.to_csv(f, sep='\t', index=False)