update readme + add comment for snakemake

1aaa5f3c · pipaj97 · 1c5a2c79 · 1aaa5f3c · 1aaa5f3c · 1aaa5f3c
Commit 1aaa5f3c authored 2 years ago by pipaj97
--- a/README.md
+++ b/README.md
@@ -3,24 +3,43 @@


 ## Setup
-clone the covsonar repository on a paralell level to this folder, so that the parent folder contains the covid_project3 folder and the covsonar repository.
+Clone the covsonar repository on a paralell level to this folder, so that the parent folder contains the covid_project3 folder and the covsonar repository.
+The covsonar repository can be found here [https://github.com/rki-mf1/covsonar](https://github.com/rki-mf1/covsonar) and can be cloned by using the following command.
+
+```
+git clone https://github.com/rki-mf1/covsonar.git
+```
+
+Additionally to the covsonar repository, the escape scores from the Bloom lab are required. They can be found here: [https://github.com/jbloomlab/SARS2_RBD_Ab_escape_maps/blob/main/processed_data/escape_data_mutation.csv](https://github.com/jbloomlab/SARS2_RBD_Ab_escape_maps/blob/main/processed_data/escape_data_mutation.csv)
+During development, the whole repository was also cloned on a paralell level to this folder.
+It can be necessary to adjust the paths in the config.yaml, if the folder are somewhere else.

 The pipeline was tested by using the docker container build by the dockerfile in this folder.

+```
 docker build -t pipaj97/sars_cov_2_project3 . 
+```

-After pulling the container it should be run with mounting the parent folder of the covsonar repository 
-and the project folder. 
+After building the container it should be run with mounting the parent folder of the covsonar repository 
+and the project folder. It should be noted, that the escape_data_mutation.csv also needs to be mounted.

+```
 docker run --rm -it --mount src="$(pwd)",target=/project_container,type=bind pipaj97/sars_cov_2_project3
+```

-switch to the right directory 
+Switch to the right directory.
+```
 cd project_container/covid_project3/
+```

-activate snakemake env
+Activate the snakemake environment.
+```
 conda activate snakemake
+```

-run the snakemake workflow
+Run the snakemake workflow.
+```
 snakemake --use-conda -c
+```

-
+In the config file, parameters for the risk score computations can be adjusted. The results in the repository are created using the predefined parameter values in the config.yaml
\ No newline at end of file
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -7,13 +7,19 @@ result_dir: "results"
 # when following the command provided in the readme, this path will work already.
 # the same applies for the risk scores. They can be found here: https://github.com/jbloomlab/SARS2_RBD_Ab_escape_maps/blob/main/processed_data/escape_data_mutation.csv
 # (git clone https://github.com/jbloomlab/SARS2_RBD_Ab_escape_maps.git)
-covsonar_repo: "/project_container/covsonar"
-escape_scores: "/project_container/SARS2_RBD_Ab_escape_maps/processed_data/escape_data_mutation.csv"
+covsonar_repo: "/Users/julian/PythonProjects/master/covid/covsonar"
+escape_scores: "/Users/julian/PythonProjects/master/covid/SARS2_RBD_Ab_escape_maps/processed_data/escape_data_mutation.csv"

 # B.1.1.7, B.1.351, P.1, B.1.617.2, B.1.1.529, from pangolin (accessed 28 September 2022)   
 # 'BA.2', 'BA.4', 'BA.5' #from ECDC (SARS-CoV-2 variants of concern as of 22 September 2022)
 risk_score_params:
-  vocs: "B.1.1.7, B.1.351, P.1, B.1.617.2, B.1.1.529, BA.2, BA.4, BA.5"
-  mode: "auto"
-  weights: "0.9, 0.05, 0.05"
+  vocs: "B.1.1.7, B.1.351, P.1, B.1.617.2, B.1.1.529, BA.2, BA.4, BA.5" # Syntax is "voc1, voc2, voc3"
+  mode: "auto" # auto or specific
+  # weights to use for the risk score computation. 
+  # The first value are the mutations with an escape score, 
+  # the second are the mutations without an escape score and the third are deletions. 
+  # Only mutations and deletions in the spike protein are considered.
+  weights: "0.9, 0.05, 0.05" 
+  # cutoff value to find the variants of interest. If the mode is "auto", 
+  # the cutoff is set to the lowest risk score of the vocs in the data set.
  cutoff: 0
\ No newline at end of file
--- a/workflow/rules/covsonar.smk
+++ b/workflow/rules/covsonar.smk
+# create the covsonar db used for matching later
 rule create_covsonar_db:
    input:
        fasta=config["sample_sequences"]
@@ -11,6 +12,7 @@ rule create_covsonar_db:
    shell:
        "{params.covsonar_repo}/sonar.py add -f {input.fasta} --db {output.db} --cpus {threads}"

+# add the pango lineages to the db
 rule covsonar_update_lineage:
    input:
        db=RESULT_DIR / "db/sonar_db",
@@ -26,6 +28,7 @@ rule covsonar_update_lineage:
    shell:
        "{params.covsonar_repo}/sonar.py update --db {input.db} --cpus {threads} --csv {input.pango_report} --fields {params.fields}"

+# query the sequences to find all mutations compared to the wuhan reference (NC_045512.2).
 rule covsonar_match:
    input:
        db=RESULT_DIR / "db/sonar_db",

--- a/workflow/rules/pangolin.smk
+++ b/workflow/rules/pangolin.smk
+# assing pango lineage to the sequences
 rule pangolin_assign:
    input:
        fasta=config["sample_sequences"]

--- a/workflow/rules/phylogeny.smk
+++ b/workflow/rules/phylogeny.smk
+# compute multiple sequence alignment
 rule mafft:
    input:
        fasta=config["sample_sequences"]
@@ -11,6 +12,7 @@ rule mafft:
    shell:
        "mafft --auto --thread {threads} {input.fasta} > {output.alignment} 2> {log.stderr_log}"

+# compute the phylogenetic tree in newick format
 rule create_phy_tree_newick:
    input:
        alignment=RESULT_DIR / "mafft/msa.aln"
@@ -33,7 +35,7 @@ rule create_phy_tree_newick:
    shell:
        "iqtree --redo -s {input.alignment} -T {threads} -pre {params.prefix} > {log.stdout_log} 2> {log.stderr_log}"

-
+# visualise the phylogenetic tree
 rule create_tree_pdf:
    input:
        newick_tree=RESULT_DIR / "phylogeny/iqtree/msa.phy.treefile"

--- a/workflow/rules/risk_score.smk
+++ b/workflow/rules/risk_score.smk
+# compute risk score and print the most concerning sequence
 rule compute_risk_score:
    input:
        covsonar_file = RESULT_DIR / "covsonar/matched_seqs.tsv",

--- a/workflow/rules/statistics.smk
+++ b/workflow/rules/statistics.smk
+# count mutations for plotting
 rule count_mutations:
    input:
        tsv=RESULT_DIR / "covsonar/matched_seqs.tsv",
@@ -10,6 +11,7 @@ rule count_mutations:
    script:
        "../scripts/count_mutations.py"

+# plot the counted mutations
 rule plot_mutations:
    input:
        df=RESULT_DIR / "statistics/stats.tsv"
@@ -25,6 +27,7 @@ rule plot_mutations:
    script:
        "../scripts/plot_mutations.py"

+# compute pairwise alignment of all sequences to get the pairwise sequence identities
 rule get_sequence_identity:
    input:
        samples=config["sample_sequences"]
@@ -36,6 +39,7 @@ rule get_sequence_identity:
    script:
        "../scripts/get_seq_identity.py"

+# PCA and kmeans
 rule project_and_cluster:
    input:
        df=RESULT_DIR / "statistics/stats.tsv",