Initial commit

68d9c918 · dimit98 · 68d9c918 · 68d9c918 · 68d9c918 · 68d9c918
Commit 68d9c918 authored 4 years ago by dimit98
--- a/.gitmodules
+++ b/.gitmodules
+[submodule "scripts/Specter"]
+	path = scripts/Specter
+	url = https://github.com/canzarlab/Specter
--- a/Snakefile
+++ b/Snakefile
+# Workflow entry point: loads config.yaml, reads the samples and units tables
+# named in it, declares the final target files, and includes the rule files.
+import os
+import pandas as pd
+configfile: "config.yaml"
+# Samples table, indexed by its "sample" column; path is config["data"]["samples"].
+samples = pd.read_table(config["data"]["samples"], index_col="sample")
+# Units table; path is config["data"]["units"]. Its column count is checked in `rule all`.
+units = pd.read_table(config["data"]["units"])
+# Final targets: the slingshot and monocle figures are always requested. The
+# conditional applies only to the third item: the per-cluster enrichment PDFs
+# are requested unless the units table has exactly 3 columns.
+# NOTE(review): what a 3-column units table signifies is not visible here — confirm.
+rule all:
+        input:
+                "figures/slingshot_data_vis.jpg","figures/monocle_trajectory.jpg", dynamic("figures/{cluster}_enrichment_vis.pdf") if len(units.columns) != 3 else [] 
+# Rule files for the individual workflow stages.
+include: "rules/preprocessing.smk"
+include: "rules/read_in.smk"
+include: "rules/quality_control.smk"
+include: "rules/normalization.smk"
+include: "rules/visualization.smk"
+include: "rules/clustering.smk"
+include: "rules/slingshot_R.smk"
+include: "rules/monocle2.smk"
+include: "rules/differential_testing.smk"
+include: "rules/gene_set_analysis.smk"
--- a/config.yaml
+++ b/config.yaml
+#User settings:
+data:
+  samples: "" #path to the samples-file
+  units: "" #path to the units-file
+general:
+  amount_of_hvgs: "" #amount of highly variable genes used in the analysis
+preprocessing:
+  path_to_bamtofastq: "" #path to the bamtofastq program 
+  path_to_cellranger: "" #path to the cellranger program
+  path_to_ref: "" #path to the reference genome used
+quality_control:
+  cells: #filter of cells
+    upper_quantile_counts: "0.95"  #maximum counts per cell; use one parameter, leave the other as ""
+    max_counts: ""
+    lower_quantile_counts: "0.05"  #minimum counts per cell; use one parameter, leave the other as "" 
+    min_counts: ""
+    lower_quantile_genes: "0.05" #minimum amount of genes per cell; use one, leave the other as ""
+    min_genes: ""
+    mt_frac: "" #maximum proportion of mitochondrial genes per cell
+  genes:
+    min_cells: "" #minimum amount of cells a gene has to be expressed in
+downsampling:
+  downsampling_method: "normal" #options: "sphetcher" or "normal"; "normal" =  no downsampling
+  path_to_sphetcher: "" #path to sphetchers src-folder (only used if downsampling_method = "sphetcher")
+  sketch_size: "" #amount of cells after downsampling (only used if downsampling_method = "sphetcher")
+cell_cycle_scoring:
+  ref_genes: "" #used for visualization of cell cycle effects; tsv-file with 2 columns(s and g2m) defining cell cycle genes; OPTIONAL 
+clustering:
+  celltypes_markergenes: "" #known celltypes with their marker genes; tsv-file with 2 columns(celltype and markergenes), celltype defines a specific celltype & markergenes its markergenes as a list
+  genes_to_vis: ""   #genes that are visualized across the clustering; OPTIONAL
+  subclustering: ""    #options: "" or "True"; "" = no automatic subclustering
+  clustering_resolution: "0.5" #sensitivity parameter for the louvain-algorithm
+  cluster_method: "louvain" #options: "specter" or "louvain"; used algorithm for clustering
+specter: #parameters only used if clustering:cluster_method = "specter"
+  number_of_clusters: "8" #number of clusters in the final clustering
+  ensemble_size: "200" 
+  mingamma: "0.1"
+subclustering:
+  subclustering_resolution: "0.2" #sensitivity parameter for the louvain-algorithm
+  names_for_unannotated: "" #names to use for the unannotated clusters; OPTIONAL
+  further_subclusterings: "" #clusters to subcluster; OPTIONAL
+trajectory_inference:
+  clusters_to_include: "" #clusters that are used for trajectory inference; all clusters used if parameter is "" 
+  trajectory_start: "" #cluster that is the starting point of the trajectory; random cluster chosen if parameter is ""
+  trajectory_ending: "" #cluster that is the end point of the trajectory; random cluster chosen if parameter is ""
+differential_testing:
+  clusters: "" #clusters used for differential testing, if a subcluster is supposed to be tested: write cluster-0 instead of cluster,0; all clusters used if parameter is ""
+  DE_threshold: "0.01" #threshold for p-values of differentially expressed genes 
+gene_set_enrichment_analysis:
+  enrichment_threshold: "0.05" #threshold for p-values of enriched GO-terms  
+  organism: "" #analysed organism
--- a/env/differential_testing.yaml
+++ b/env/differential_testing.yaml
+channels:
+  - bioconda
+  - conda-forge
+  - defaults
+dependencies:
+  - bioconductor-singlecellexperiment
+  - bioconductor-mast
+  - r-seurat==3.2.0
+  - bioconductor-scater
+  - r-hdf5r
+  - bioconductor-rhdf5
--- a/env/enrichment_vis.yaml
+++ b/env/enrichment_vis.yaml
+channels:
+  - r
+dependencies:
+  - r-dplyr
+  - r-ggplot2
--- a/env/gene_set_analysis.yaml
+++ b/env/gene_set_analysis.yaml
+channels:
+  - bioconda
+  - conda-forge
+  - anaconda
+dependencies:
+  - gprofiler-official
+  - pandas
+  - matplotlib
--- a/env/monocle2.yaml
+++ b/env/monocle2.yaml
+channels: 
+  - bioconda
+  - conda-forge
+dependencies:
+  - bioconductor-monocle
+  - bioconductor-biobase
+  - r-rcolorbrewer
+  - r-base
--- a/env/oneforall.yaml
+++ b/env/oneforall.yaml
+channels:
+  - bioconda
+  - conda-forge
+  - defaults
+dependencies:
+  - anndata==0.6.22.post1
+  - seaborn
+  - umap-learn=0.3.9
+  - scipy
+  - scanpy
+  - python-igraph
--- a/env/oneforall_subclustering.yaml
+++ b/env/oneforall_subclustering.yaml
+channels:
+  - bioconda
+  - conda-forge
+  - defaults
+dependencies:
+  - anndata==0.6.22.post1
+  - seaborn
+  - umap-learn=0.3.9
+  - scipy
+  - scanpy
+  - python-igraph
+  - h5py==2.9.0
--- a/env/r_norm.yaml
+++ b/env/r_norm.yaml
+channels:
+  - conda-forge
+  - bioconda
+  - r
+dependencies:
+  - bioconductor-scran
+  - r==3.5.1
+  - r-base
--- a/env/slingshot_R.yaml
+++ b/env/slingshot_R.yaml
+channels:
+  - r
+  - bioconda
+  - conda-forge
+dependencies:
+  - r-rcolorbrewer
+  - bioconductor-slingshot
+  - bioconductor-singlecellexperiment==1.10.1
+  - r
+  - r-seurat
+  - bioconductor-scater
+  - bioconductor-clusterexperiment
+  - r-gam
+  - r-base
+  - r-hdf5r 
+  - bioconductor-rhdf5
--- a/report/annotated_trajectory.rst
+++ b/report/annotated_trajectory.rst
+Visualisierung der annotierten Trajektorie.
--- a/report/annotated_trajectory_nbc.rst
+++ b/report/annotated_trajectory_nbc.rst
+Visualisierung der annotierten Trajektorie auf Basis der nicht batch-korrigierten Daten.
--- a/report/diffmap_datavis.rst
+++ b/report/diffmap_datavis.rst
+Visualisierung von dimensionsreduzierten Daten mit Hilfe von diffusion maps.
--- a/report/diffmap_diffusion_pseudotime_1.rst
+++ b/report/diffmap_diffusion_pseudotime_1.rst
+Visualisierung der sog. Diffusion Pseudotime, welche die Daten in einer Abfolge, basierend auf transkriptionellen Ähnlichkeiten zwischen den Zellen, visualisiert.
--- a/report/diffmap_diffusion_pseudotime_2.rst
+++ b/report/diffmap_diffusion_pseudotime_2.rst
+Visualisierung der sog. Diffusion Pseudotime, welche die Daten in einer Abfolge, basierend auf transkriptionellen Ähnlichkeiten zwischen den Zellen, visualisiert.
--- a/report/draw_graph_fr_datavis.rst
+++ b/report/draw_graph_fr_datavis.rst
+Visualisierung von dimensionsreduzierten Daten mit Hilfe des Drawgraph Algorithmus.
--- a/report/enrichment_vis.rst
+++ b/report/enrichment_vis.rst
+Visualisierung der 10 am signifikantesten angereicherten GO-Terme für das Cluster: {{snakemake.wildcards.cluster}}. Die GO-Terme sind auf der y-Achse dargestellt, während auf der x-Achse das Verhältnis zwischen differentiell exprimierten Genen eines GO-Terms und nicht differentiell exprimierten Genen dieses GO-Terms dargestellt wird. Farblich markiert ist der P-Wert und die Größe der Kreise markiert die Anzahl differentiell exprimierter Gene.
--- a/report/gene_dispersion.rst
+++ b/report/gene_dispersion.rst
+Abgebildet ist die Dispersion der Gene in normalisierter und nicht-normalisierter Form in Abhängigkeit von den durchschnittlichen Expressionen der Gene. Stark variable Gene sind dabei durch die dunkle Farbe hervorgehoben.
--- a/report/gene_ranking.rst
+++ b/report/gene_ranking.rst
+Visualisierung der am stärksten exprimierten Gene in jedem Cluster.