Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import sys
import pandas as pd
FatherID=sys.argv[1]
MotherID=sys.argv[2]
Offspring=sys.argv[3]
vcf_file = sys.argv[4]
print ("##################################################################################")
print ("Trio:", FatherID, MotherID,Offspring)
# load the vcf file
header_vcf = []
records = []
with open(vcf_file, 'r') as vcf_:
for line in vcf_:
if line.startswith('##'):
header_vcf.append(line)
continue
if line.startswith('#C'):
keys = line.split('\t')
keys[0] = keys[0][1:]
keys[-1] = keys[-1][:-1]
records.append(line[:-1].split('\t'))
headervcf = "".join(header_vcf)
df = pd.DataFrame(records, columns=keys).iloc[1:,:]
# select only the columns with father id, mother id and offspring id
data = df[["CHROM","POS",FatherID, MotherID, Offspring]].copy()
# keep only the first 3 characters of the genotype
data[FatherID] = data[FatherID].apply(lambda x: x[:3])
data[MotherID] = data[MotherID].apply(lambda x: x[:3])
data[Offspring] = data[Offspring].apply(lambda x: x[:3])
# function to classify the genotype of the offspring
def gen(father,mother,offspring):
if father=="./." or mother =="./." or offspring =="./.":
return "Missing data" #any ./.
elif (offspring[0]==father[0] and offspring[-1]==mother[0]) or \
(offspring[0]==father[0] and offspring[-1]==mother[-1]) or \
(offspring[0]==father[-1] and offspring[-1]==mother[0]) or \
(offspring[0]==father[-1] and offspring[-1]==mother[-1]) or \
(offspring[0]==mother[0] and offspring[-1]==father[0]) or \
(offspring[0]==mother[0] and offspring[-1]==father[-1]) or \
(offspring[0]==mother[-1] and offspring[-1]==father[0]) or \
(offspring[0]==mother[-1] and offspring[-1]==father[-1]):
return "CORRECT"
else: # offspring not a combination of parent's genotypes
return "INCORRECT"
# create a new column with the classification of the genotype of the offspring
classification = [gen(data.loc[i,FatherID],data.loc[i,MotherID],data.loc[i,Offspring]) for i in range(1,len(data)+1)]
df["classification"] = classification
trio = df[["CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT",FatherID, MotherID, Offspring, "classification"]]
print(pd.DataFrame(trio.classification.value_counts()))
print("----------------------------------------------------------------------------------")
print("--------------------Correct/incorrect rate (except missing)-----------------------")
df1 = pd.DataFrame(trio[trio.classification!="Missing data"].classification.value_counts(normalize=True))
df2 = pd.DataFrame(trio[trio.classification!="Missing data"].classification.value_counts())
concat = pd.concat([df1, df2], axis=1, join='inner')
concat.columns=["GA rate", "GA absolute"]
print(concat)
# create filtered VCF
select = trio[trio.classification=="CORRECT"].drop(columns="classification").rename(columns={"CHROM":"#CHROM"})
with open (f"{FatherID}_{MotherID}_{Offspring}.vcf", "a") as f:
f.write(headervcf)
select.to_csv(f, sep='\t', index=False)