end of day update

jjjermiah · Sep 13, 2023 · 279664d · 279664d
1 parent 2f8b7a3
commit 279664d
Show file tree

Hide file tree

Showing 8 changed files with 57 additions and 158 deletions.
diff --git a/Snakefile b/Snakefile
@@ -1,4 +1,4 @@
-from os.path import join, basename,dirname
+from os.path import join, basename, dirname, splitext
 import re
 import pandas as pd
 
@@ -25,8 +25,16 @@ reference_genome = config["ref"]
 SRA_METADATA_FILE = "metadata/sra_metadata.csv"
 sra_metadata = pd.read_csv(SRA_METADATA_FILE)
 sample_accessions = sra_metadata["run_accession"].tolist()
-# sample_accessions = 'SRR8615581'
-sample_accessions = sample_accessions[1]
+julia_list = [
+    "SRR8615281", "SRR8615282", "SRR8615300", "SRR8615338", "SRR8615368", "SRR8615369", "SRR8615376",
+    "SRR8615398", "SRR8615459", "SRR8615475", "SRR8615504", "SRR8615506", "SRR8615561", "SRR8615564",
+    "SRR8615580", "SRR8615581", "SRR8615671", "SRR8615709", "SRR8615749", "SRR8615753", "SRR8615788",
+    "SRR8615803", "SRR8615817", "SRR8615854", "SRR8615856", "SRR8615896", "SRR8615924", "SRR8615933",
+    "SRR8615934", "SRR8616011", "SRR8616014", "SRR8616032", "SRR8616033", "SRR8616044", "SRR8616058",
+    "SRR8616142", "SRR8616177", "SRR8618304", "SRR8618306", "SRR8618307"
+]
+# sample_accessions = sample_accessions[1]
+sample_accessions = julia_list
 # SRA_METADATA_FILE = "metadata/gCSI_metadata.csv"
 # gCSI_metadata = pd.read_csv(gCSI_METADATA_FILE)
 # sample_accessions = "586986_1"

diff --git a/dag.pdf b/dag.pdf
diff --git a/workflow/envs/bwa_index.yaml b/workflow/envs/bwa_index.yaml
@@ -3,4 +3,4 @@ channels:
   - bioconda
   - nodefaults
 dependencies:
-  - bwa =0.7.17
+  - bwa = 0.7.17
diff --git a/workflow/envs/bwa_mem.yaml b/workflow/envs/bwa_mem.yaml
@@ -3,8 +3,7 @@ channels:
   - bioconda
   - nodefaults
 dependencies:
-  - bwa-meme =1.0.6
+  - bwa = 0.7.17
   - samtools =1.17
-  - samblaster =0.1.26
-  - mbuffer =20160228
-  - picard-slim =3.0.0
+  - picard-slim =3.0.0
+  - snakemake-wrapper-utils =0.6.2
diff --git a/workflow/rules/bwa.smk b/workflow/rules/bwa.smk
@@ -2,18 +2,19 @@
 rule bwa_mem:
     input:
         reads=get_fastq_pe,
+        idx_genome=join(ref_path, "bwa", "genome.fa"),
         idx=multiext(join(ref_path, "bwa", "genome"), ".amb", ".ann", ".bwt", ".pac", ".sa"),
     output:
-        output=join("processed_data/{PROJECT_NAME}/", "alignment/{sample}.sam")
+        output=join("procdata/{PROJECT_NAME}/", "alignment/{sample}.sam")
     params:
-        extra=r"-R '@RG\tID:{wildcards.sample}\tSM:{wildcards.sample}'"
+        extra=r"-R '@RG\tID:{sample}\tSM:{sample}'"
     conda:
         "../envs/bwa_mem.yaml"
     threads: 32
     resources:
         machine_type = machines['high_mem']['name']
     shell:
-        "bwa mem -t {threads} {params.extra} {input.idx[0]} {input.reads} > {output}"
+        "ls -la $(dirname {input.idx_genome}); bwa mem -t {threads} $(dirname {input.idx_genome})/genome {input.reads} > {output}"
 
 rule build_bwa_index:
     input:

diff --git a/workflow/rules/circularRNA.smk b/workflow/rules/circularRNA.smk
@@ -2,7 +2,7 @@
 
 rule CIRI2:
     input:
-        sam=join("processed_data/{PROJECT_NAME}/", "alignment/{sample}.sam"),
+        sam=join("procdata/{PROJECT_NAME}/", "alignment/{sample}.sam"),
         gtf=join(ref_path, "bwa", "annotation.gtf"),
         idx=multiext(join(ref_path, "bwa", "genome"), ".fa", ".amb", ".ann", ".bwt", ".pac", ".sa")
     output:
@@ -11,8 +11,6 @@ rule CIRI2:
     log: "log/{PROJECT_NAME}/CIRI2/{sample}.log"
     container:
         "docker://andremrsantos/ciri2:latest"
-    resources:
-        machine_type = machines['small_cpu']['name']
     shell:
         """CIRI2 \
         --thread_num {threads} \
@@ -22,121 +20,32 @@ rule CIRI2:
         --ref_file {input.idx[0]} \
         --log {log}"""
 
+rule circRNA_finder:
+    input:
+        aln="procdata/{PROJECT_NAME}/star/pe/{sample}/{sample}_pe_aligned.sam",
+        log="procdata/{PROJECT_NAME}/logs/pe/{sample}/{sample}_Log.out",
+        sj="procdata/{PROJECT_NAME}/star/pe/{sample}/{sample}_SJ.out.tab",
+        chim_junc="procdata/{PROJECT_NAME}/star/pe/{sample}/{sample}_Chimeric.out.junction",
+        unmapped=["procdata/{PROJECT_NAME}/star/pe/{sample}/{sample}_unmapped.1.fastq.gz","{procdata}/{PROJECT_NAME}/star/pe/{sample}/{sample}_unmapped.2.fastq.gz"],
+    output:
+        filteredjunctions="results/{PROJECT_NAME}/circRNA_finder/_filteredJunctions.bed",
+        GT_AG_filteredjunctions="results/{PROJECT_NAME}/circRNA_finder/_s_filteredJunctions.bed",
+        fw_filteredjunctions="results/{PROJECT_NAME}/circRNA_finder/_s_filteredJunctions_fw.bed",
+        sorted_bam="results/{PROJECT_NAME}/circRNA_finder/Chimeric.out.sorted.bam",
+        sorted_indexed_bam="results/{PROJECT_NAME}/circRNA_finder/Chimeric.out.sorted.bam.bai"
+
+# a) _filteredJunctions.bed: 
+    # A bed file with all circular junctions found by the pipeline. 
+    # The score column indicates the number reads spanning each junction.
 
-# rule: 
-#     input:
+# b) _s_filteredJunctions.bed: 
+    # A bed file with those juction in (a) that are flanked by GT-AG splice sites. 
+    # The score column indicates the number reads spanning each junction.
 
-#     output:
+# c) _s_filteredJunctions_fw.bed: 
+    # A bed file with the same circular junctions as in file (b), 
+    # but here the score column gives the average number of forward spliced reads at both splice sites around each circular junction.
 
-#     shell:
-#         """
-#         STAR \
-#         --runThreadN {threads} \
-#         --genomeDir {params.index} \
-#         --readFilesIn {input.reads1} {input.reads2} \
-#         --readFilesCommand zcat \
-#         --outFileNamePrefix {params.outFileNamePrefix} \
-#         --outSAMattributes All \
-#         --outStd BAM_Unsorted \
-#         --outSAMtype BAM Unsorted \
-#         --outSAMattrRGline ID:rnaseq_pipeline SM:{params.sample_id} \
-#         {params.additional_params} \
-#         > {output.bam};
-#         """
+# d) (Sorted and indexed) bam file with all chimeric reads identified by STAR. 
+    # The circRNA junction spanning reads are a subset of these.
 
-# rule pe_map_genome_star:
-#     """
-#         Map to genome using STAR
-#     """
-#     input:
-#         index=lambda wildcards: os.path.join(
-#             config["star_indexes"],
-#             get_sample("organism", search_id="index", search_value=wildcards.sample),
-#             get_sample("index_size", search_id="index", search_value=wildcards.sample),
-#             "STAR_index",
-#             "chrNameLength.txt",
-#         ),
-#         reads1=os.path.join(
-#             config["output_dir"],
-#             "samples",
-#             "{sample}",
-#             "{sample}.fq1.pe.remove_polya.fastq.gz",
-#         ),
-#         reads2=os.path.join(
-#             config["output_dir"],
-#             "samples",
-#             "{sample}",
-#             "{sample}.fq2.pe.remove_polya.fastq.gz",
-#         ),
-#     output:
-#         bam=os.path.join(
-#             config["output_dir"],
-#             "samples",
-#             "{sample}",
-#             "map_genome",
-#             "{sample}.pe.Aligned.out.bam",
-#         ),
-#         logfile=os.path.join(
-#             config["output_dir"],
-#             "samples",
-#             "{sample}",
-#             "map_genome",
-#             "{sample}.pe.Log.final.out",
-#         ),
-#     params:
-#         cluster_log_path=config["cluster_log_dir"],
-#         sample_id="{sample}",
-#         index=lambda wildcards: os.path.abspath(
-#             os.path.join(
-#                 config["star_indexes"],
-#                 get_sample(
-#                     "organism", search_id="index", search_value=wildcards.sample
-#                 ),
-#                 get_sample(
-#                     "index_size", search_id="index", search_value=wildcards.sample
-#                 ),
-#                 "STAR_index",
-#             )
-#         ),
-#         outFileNamePrefix=lambda wildcards, output: output.bam.replace(
-#             "Aligned.out.bam", ""
-#         ),
-#         additional_params=parse_rule_config(
-#             rule_config,
-#             current_rule=current_rule,
-#             immutable=(
-#                 "--genomeDir",
-#                 "--readFilesIn",
-#                 "--readFilesCommand",
-#                 "--outFileNamePrefix",
-#                 "--outSAMattributes",
-#                 "--outStd",
-#                 "--outSAMtype",
-#                 "--outSAMattrRGline",
-#             ),
-#         ),
-#     container:
-#         "docker://quay.io/biocontainers/star:2.7.8a--h9ee0642_1"
-#     conda:
-#         os.path.join(workflow.basedir, "envs", "STAR.yaml")
-#     threads: 12
-#     resources:
-#         mem_mb=lambda wildcards, attempt: 32000 * attempt,
-#     log:
-#         stderr=os.path.join(
-#             config["log_dir"], "samples", "{sample}", current_rule + ".stderr.log"
-#         ),
-#     shell:
-#         "(STAR \
-#         --runThreadN {threads} \
-#         --genomeDir {params.index} \
-#         --readFilesIn {input.reads1} {input.reads2} \
-#         --readFilesCommand zcat \
-#         --outFileNamePrefix {params.outFileNamePrefix} \
-#         --outSAMattributes All \
-#         --outStd BAM_Unsorted \
-#         --outSAMtype BAM Unsorted \
-#         --outSAMattrRGline ID:rnaseq_pipeline SM:{params.sample_id} \
-#         {params.additional_params} \
-#         > {output.bam};) \
-#         2> {log.stderr}"
diff --git a/workflow/rules/kallisto.smk b/workflow/rules/kallisto.smk
@@ -0,0 +1,12 @@
+rule kallisto_index:
+    input:
+        fasta="{transcriptome}.fasta",
+    output:
+        index="{transcriptome}.idx",
+    params:
+        extra="",  # optional parameters
+    log:
+        "logs/kallisto_index_{transcriptome}.log",
+    threads: 1
+    wrapper:
+        "v2.6.0/bio/kallisto/index"
diff --git a/workflow/rules/star.smk b/workflow/rules/star.smk
@@ -51,33 +51,3 @@ rule star_pe_multi:
     threads: 32
     wrapper:
         "v2.6.0/bio/star/align"
-    # shell:
-    #     "STAR \
-    #     --runThreadN {threads} \
-    #     --genomeDir {input.idx} \
-    #     --readFilesIn {input.fq1} {input.fq2} \
-    #     --readFilesCommand gunzip -c \
-    #     {params.extra} \
-    #     --outReadsUnmapped Fastx \
-    #     --outFileNamePrefix  {params.outFileNamePrefix} \
-    #     --outStd BAM_SortedByCoordinate \
-    #     > {output.aln};"
-
-    # script:
-    #     "../scripts/map_star.py"
-    # """
-    # STAR \
-    # --genomeDir $genomeDir \
-    # --readFilesCommand gunzip -c --readFilesIn $inFile1 $inFile2 \
-    # --runThreadN $nThreads \
-    # --chimSegmentMin $chimSegMin \
-    # --chimScoreMin 1 \
-    # --alignIntronMax $alignIntronMax \
-    # --outFilterMismatchNoverReadLmax $maxMismatchFraction \
-    # --alignTranscriptsPerReadNmax $alignTxPerReadMax \
-    # --twopassMode Basic \
-    # --outSAMtype BAM SortedByCoordinate \
-    # --chimOutType Junctions SeparateSAMold \
-    # --outFilterMultimapNmax 2 \
-    # --outFileNamePrefix $outPrefix
-    # """