Skip to content
Snippets Groups Projects
Commit 3e1f9f19 authored by larissasa's avatar larissasa
Browse files

Upload New File

parent f9e21be3
No related branches found
No related tags found
No related merge requests found
import sys
import pandas as pd
FatherID=sys.argv[1]
MotherID=sys.argv[2]
Offspring=sys.argv[3]
vcf_file = sys.argv[4]
print ("##################################################################################")
print ("Trio:", FatherID, MotherID,Offspring)
# load the vcf file
header_vcf = []
records = []
with open(vcf_file, 'r') as vcf_:
for line in vcf_:
if line.startswith('##'):
header_vcf.append(line)
continue
if line.startswith('#C'):
keys = line.split('\t')
keys[0] = keys[0][1:]
keys[-1] = keys[-1][:-1]
records.append(line[:-1].split('\t'))
headervcf = "".join(header_vcf)
df = pd.DataFrame(records, columns=keys).iloc[1:,:]
# select only the columns with father id, mother id and offspring id
data = df[["CHROM","POS",FatherID, MotherID, Offspring]].copy()
# keep only the first 3 characters of the genotype
data[FatherID] = data[FatherID].apply(lambda x: x[:3])
data[MotherID] = data[MotherID].apply(lambda x: x[:3])
data[Offspring] = data[Offspring].apply(lambda x: x[:3])
# function to classify the genotype of the offspring
def gen(father,mother,offspring):
if father=="./." or mother =="./." or offspring =="./.":
return "Missing data" #any ./.
elif (offspring[0]==father[0] and offspring[-1]==mother[0]) or \
(offspring[0]==father[0] and offspring[-1]==mother[-1]) or \
(offspring[0]==father[-1] and offspring[-1]==mother[0]) or \
(offspring[0]==father[-1] and offspring[-1]==mother[-1]) or \
(offspring[0]==mother[0] and offspring[-1]==father[0]) or \
(offspring[0]==mother[0] and offspring[-1]==father[-1]) or \
(offspring[0]==mother[-1] and offspring[-1]==father[0]) or \
(offspring[0]==mother[-1] and offspring[-1]==father[-1]):
return "CORRECT"
else: # offspring not a combination of parent's genotypes
return "INCORRECT"
# create a new column with the classification of the genotype of the offspring
classification = [gen(data.loc[i,FatherID],data.loc[i,MotherID],data.loc[i,Offspring]) for i in range(1,len(data)+1)]
df["classification"] = classification
trio = df[["CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT",FatherID, MotherID, Offspring, "classification"]]
print(pd.DataFrame(trio.classification.value_counts()))
print("----------------------------------------------------------------------------------")
print("--------------------Correct/incorrect rate (except missing)-----------------------")
df1 = pd.DataFrame(trio[trio.classification!="Missing data"].classification.value_counts(normalize=True))
df2 = pd.DataFrame(trio[trio.classification!="Missing data"].classification.value_counts())
concat = pd.concat([df1, df2], axis=1, join='inner')
concat.columns=["GA rate", "GA absolute"]
print(concat)
# create filtered VCF
select = trio[trio.classification=="CORRECT"].drop(columns="classification").rename(columns={"CHROM":"#CHROM"})
with open (f"{FatherID}_{MotherID}_{Offspring}.vcf", "a") as f:
f.write(headervcf)
select.to_csv(f, sep='\t', index=False)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment