add test data procedure in the README

3956e8da · Rachel LEGENDRE · 3045e84b · 3956e8da
Commit 3956e8da authored 3 years ago by Rachel LEGENDRE
--- a/README.md
+++ b/README.md
@@ -73,6 +73,8 @@ You need to copy the singularity image in the cloned ChIPflow directory and rena
 `mv chipflow_latest.sif chipflow/chipflow.img`
+**Note that you can use public datasets to test the pipeline, as described [here](https://gitlab.pasteur.fr/hub/chipflow/-/blob/master/README.md#run-the-pipeline-on-test-data).**
 *  Step 3: Execute workflow
@@ -86,7 +88,7 @@ Execute the workflow locally via
 using $N cores or run it in a cluster environment via
-`snakemake --use-singularity --singularity-args "-B '$HOME'" --cluster-config config/cluster_config.json --cluster "sbatch --mem={cluster.ram} --cpus-per-task={threads} " -j 200 --nolock --cores $SLURM_JOB_CPUS_PER_NODE`
+`snakemake --use-singularity --singularity-args "-B '$HOME'" --cluster-config config/cluster_config.json --cluster "sbatch --mem={cluster.ram} --cpus-per-task={threads} " -j 200 --nolock --cores 1`
 Visualize how the rules are connected via 
@@ -206,6 +208,53 @@ report_header_info:
 ```
 <img src="images/multiqc_header.png" width="600">
+### Run the pipeline on test data 
+You need to have sra-toolkit installed before downloading the test data.
+```
+# Download genome references
+wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/635/GCF_000001635.26_GRCm38.p6/GCF_000001635.26_GRCm38.p6_genomic.fna.gz
+gunzip GCF_000001635.26_GRCm38.p6_genomic.fna.gz
+# get blacklisted regions
+wget http://mitra.stanford.edu/kundaje/akundaje/release/blacklists/mm10-mouse/mm10.blacklist.bed.gz
+gunzip mm10.blacklist.bed.gz
+# create genome directory
+mkdir genome
+mv GCF_000001635.26_GRCm38.p6_genomic.fna genome/mm10.fa
+mv mm10.blacklist.bed genome/mm10.blacklist.bed
+# copy the test config and design files
+cp test/config.yaml config/config.yaml
+cp test/design.txt config/design.txt
+# Download FastQ files from GEO (GSE99009) https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE99009
+# Only include the H3K27ac histone mark and Klf4 transcription factor with their associated inputs for the shUbc9 and shCtrl conditions
+SRR=("SRR5572646" "SRR5572647" "SRR5572658" "SRR5572659" "SRR5572668" "SRR5572669" "SRR5572676" "SRR5572677" "SRR5572652" "SRR5572653" "SRR5572664" "SRR5572665")
+sample=("H3K27ac_shCtrl_Rep1_R1" "H3K27ac_shCtrl_Rep2_R1" "H3K27ac_shUbc9_Rep1_R1" "H3K27ac_shUbc9_Rep2_R1" "Klf4_shCtrl_Rep1_R1" "Klf4_shCtrl_Rep2_R1" "Klf4_shUbc9_Rep1_R1" "Klf4_shUbc9_Rep2_R1" "INPUT_shCtrl_Rep1_R1" "INPUT_shCtrl_Rep2_R1" "INPUT_shUbc9_Rep1_R1" "INPUT_shUbc9_Rep2_R1")
+mkdir data
+cd data
+for i in  ${!SRR[*]} ; do
+    echo ${SRR[$i]}, ${sample[$i]}
+    prefetch ${SRR[$i]} -o ${sample[$i]}.sra
+    fastq-dump ${sample[$i]}.sra
+done
+rm *.sra 
+for file in *.fastq ; do 
+    pigz $file ; 
+done
+```
 ## How to cite ChIPflow ?
 https://doi.org/10.1101/2021.02.02.429342