Skip to content
This repository has been archived by the owner on Apr 12, 2024. It is now read-only.

Commit

Permalink
end of day update
Browse files Browse the repository at this point in the history
  • Loading branch information
Jermiah committed Sep 13, 2023
1 parent 2f8b7a3 commit 279664d
Show file tree
Hide file tree
Showing 8 changed files with 57 additions and 158 deletions.
14 changes: 11 additions & 3 deletions Snakefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from os.path import join, basename,dirname
from os.path import join, basename, dirname, splitext
import re
import pandas as pd

Expand All @@ -25,8 +25,16 @@ reference_genome = config["ref"]
SRA_METADATA_FILE = "metadata/sra_metadata.csv"
sra_metadata = pd.read_csv(SRA_METADATA_FILE)
sample_accessions = sra_metadata["run_accession"].tolist()
# sample_accessions = 'SRR8615581'
sample_accessions = sample_accessions[1]
julia_list = [
"SRR8615281", "SRR8615282", "SRR8615300", "SRR8615338", "SRR8615368", "SRR8615369", "SRR8615376",
"SRR8615398", "SRR8615459", "SRR8615475", "SRR8615504", "SRR8615506", "SRR8615561", "SRR8615564",
"SRR8615580", "SRR8615581", "SRR8615671", "SRR8615709", "SRR8615749", "SRR8615753", "SRR8615788",
"SRR8615803", "SRR8615817", "SRR8615854", "SRR8615856", "SRR8615896", "SRR8615924", "SRR8615933",
"SRR8615934", "SRR8616011", "SRR8616014", "SRR8616032", "SRR8616033", "SRR8616044", "SRR8616058",
"SRR8616142", "SRR8616177", "SRR8618304", "SRR8618306", "SRR8618307"
]
# sample_accessions = sample_accessions[1]
sample_accessions = julia_list
# SRA_METADATA_FILE = "metadata/gCSI_metadata.csv"
# gCSI_metadata = pd.read_csv(gCSI_METADATA_FILE)
# sample_accessions = "586986_1"
Expand Down
Binary file modified dag.pdf
Binary file not shown.
2 changes: 1 addition & 1 deletion workflow/envs/bwa_index.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ channels:
- bioconda
- nodefaults
dependencies:
- bwa =0.7.17
- bwa = 0.7.17
7 changes: 3 additions & 4 deletions workflow/envs/bwa_mem.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@ channels:
- bioconda
- nodefaults
dependencies:
- bwa-meme =1.0.6
- bwa = 0.7.17
- samtools =1.17
- samblaster =0.1.26
- mbuffer =20160228
- picard-slim =3.0.0
- picard-slim =3.0.0
- snakemake-wrapper-utils =0.6.2
7 changes: 4 additions & 3 deletions workflow/rules/bwa.smk
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,19 @@
rule bwa_mem:
input:
reads=get_fastq_pe,
idx_genome=join(ref_path, "bwa", "genome.fa"),
idx=multiext(join(ref_path, "bwa", "genome"), ".amb", ".ann", ".bwt", ".pac", ".sa"),
output:
output=join("processed_data/{PROJECT_NAME}/", "alignment/{sample}.sam")
output=join("procdata/{PROJECT_NAME}/", "alignment/{sample}.sam")
params:
extra=r"-R '@RG\tID:{wildcards.sample}\tSM:{wildcards.sample}'"
extra=r"-R '@RG\tID:{sample}\tSM:{sample}'"
conda:
"../envs/bwa_mem.yaml"
threads: 32
resources:
machine_type = machines['high_mem']['name']
shell:
"bwa mem -t {threads} {params.extra} {input.idx[0]} {input.reads} > {output}"
"ls -la $(dirname {input.idx_genome}); bwa mem -t {threads} $(dirname {input.idx_genome})/genome {input.reads} > {output}"

rule build_bwa_index:
input:
Expand Down
143 changes: 26 additions & 117 deletions workflow/rules/circularRNA.smk
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

rule CIRI2:
input:
sam=join("processed_data/{PROJECT_NAME}/", "alignment/{sample}.sam"),
sam=join("procdata/{PROJECT_NAME}/", "alignment/{sample}.sam"),
gtf=join(ref_path, "bwa", "annotation.gtf"),
idx=multiext(join(ref_path, "bwa", "genome"), ".fa", ".amb", ".ann", ".bwt", ".pac", ".sa")
output:
Expand All @@ -11,8 +11,6 @@ rule CIRI2:
log: "log/{PROJECT_NAME}/CIRI2/{sample}.log"
container:
"docker://andremrsantos/ciri2:latest"
resources:
machine_type = machines['small_cpu']['name']
shell:
"""CIRI2 \
--thread_num {threads} \
Expand All @@ -22,121 +20,32 @@ rule CIRI2:
--ref_file {input.idx[0]} \
--log {log}"""

rule circRNA_finder:
input:
aln="procdata/{PROJECT_NAME}/star/pe/{sample}/{sample}_pe_aligned.sam",
log="procdata/{PROJECT_NAME}/logs/pe/{sample}/{sample}_Log.out",
sj="procdata/{PROJECT_NAME}/star/pe/{sample}/{sample}_SJ.out.tab",
chim_junc="procdata/{PROJECT_NAME}/star/pe/{sample}/{sample}_Chimeric.out.junction",
unmapped=["procdata/{PROJECT_NAME}/star/pe/{sample}/{sample}_unmapped.1.fastq.gz","{procdata}/{PROJECT_NAME}/star/pe/{sample}/{sample}_unmapped.2.fastq.gz"],
output:
filteredjunctions="results/{PROJECT_NAME}/circRNA_finder/_filteredJunctions.bed",
GT_AG_filteredjunctions="results/{PROJECT_NAME}/circRNA_finder/_s_filteredJunctions.bed",
fw_filteredjunctions="results/{PROJECT_NAME}/circRNA_finder/_s_filteredJunctions_fw.bed",
sorted_bam="results/{PROJECT_NAME}/circRNA_finder/Chimeric.out.sorted.bam",
sorted_indexed_bam="results/{PROJECT_NAME}/circRNA_finder/Chimeric.out.sorted.bam.bai"

# a) _filteredJunctions.bed:
# A bed file with all circular junctions found by the pipeline.
# The score column indicates the number reads spanning each junction.

# rule:
# input:
# b) _s_filteredJunctions.bed:
# A bed file with those juction in (a) that are flanked by GT-AG splice sites.
# The score column indicates the number reads spanning each junction.

# output:
# c) _s_filteredJunctions_fw.bed:
# A bed file with the same circular junctions as in file (b),
# but here the score column gives the average number of forward spliced reads at both splice sites around each circular junction.

# shell:
# """
# STAR \
# --runThreadN {threads} \
# --genomeDir {params.index} \
# --readFilesIn {input.reads1} {input.reads2} \
# --readFilesCommand zcat \
# --outFileNamePrefix {params.outFileNamePrefix} \
# --outSAMattributes All \
# --outStd BAM_Unsorted \
# --outSAMtype BAM Unsorted \
# --outSAMattrRGline ID:rnaseq_pipeline SM:{params.sample_id} \
# {params.additional_params} \
# > {output.bam};
# """
# d) (Sorted and indexed) bam file with all chimeric reads identified by STAR.
# The circRNA junction spanning reads are a subset of these.

# rule pe_map_genome_star:
# """
# Map to genome using STAR
# """
# input:
# index=lambda wildcards: os.path.join(
# config["star_indexes"],
# get_sample("organism", search_id="index", search_value=wildcards.sample),
# get_sample("index_size", search_id="index", search_value=wildcards.sample),
# "STAR_index",
# "chrNameLength.txt",
# ),
# reads1=os.path.join(
# config["output_dir"],
# "samples",
# "{sample}",
# "{sample}.fq1.pe.remove_polya.fastq.gz",
# ),
# reads2=os.path.join(
# config["output_dir"],
# "samples",
# "{sample}",
# "{sample}.fq2.pe.remove_polya.fastq.gz",
# ),
# output:
# bam=os.path.join(
# config["output_dir"],
# "samples",
# "{sample}",
# "map_genome",
# "{sample}.pe.Aligned.out.bam",
# ),
# logfile=os.path.join(
# config["output_dir"],
# "samples",
# "{sample}",
# "map_genome",
# "{sample}.pe.Log.final.out",
# ),
# params:
# cluster_log_path=config["cluster_log_dir"],
# sample_id="{sample}",
# index=lambda wildcards: os.path.abspath(
# os.path.join(
# config["star_indexes"],
# get_sample(
# "organism", search_id="index", search_value=wildcards.sample
# ),
# get_sample(
# "index_size", search_id="index", search_value=wildcards.sample
# ),
# "STAR_index",
# )
# ),
# outFileNamePrefix=lambda wildcards, output: output.bam.replace(
# "Aligned.out.bam", ""
# ),
# additional_params=parse_rule_config(
# rule_config,
# current_rule=current_rule,
# immutable=(
# "--genomeDir",
# "--readFilesIn",
# "--readFilesCommand",
# "--outFileNamePrefix",
# "--outSAMattributes",
# "--outStd",
# "--outSAMtype",
# "--outSAMattrRGline",
# ),
# ),
# container:
# "docker://quay.io/biocontainers/star:2.7.8a--h9ee0642_1"
# conda:
# os.path.join(workflow.basedir, "envs", "STAR.yaml")
# threads: 12
# resources:
# mem_mb=lambda wildcards, attempt: 32000 * attempt,
# log:
# stderr=os.path.join(
# config["log_dir"], "samples", "{sample}", current_rule + ".stderr.log"
# ),
# shell:
# "(STAR \
# --runThreadN {threads} \
# --genomeDir {params.index} \
# --readFilesIn {input.reads1} {input.reads2} \
# --readFilesCommand zcat \
# --outFileNamePrefix {params.outFileNamePrefix} \
# --outSAMattributes All \
# --outStd BAM_Unsorted \
# --outSAMtype BAM Unsorted \
# --outSAMattrRGline ID:rnaseq_pipeline SM:{params.sample_id} \
# {params.additional_params} \
# > {output.bam};) \
# 2> {log.stderr}"
12 changes: 12 additions & 0 deletions workflow/rules/kallisto.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
rule kallisto_index:
input:
fasta="{transcriptome}.fasta",
output:
index="{transcriptome}.idx",
params:
extra="", # optional parameters
log:
"logs/kallisto_index_{transcriptome}.log",
threads: 1
wrapper:
"v2.6.0/bio/kallisto/index"
30 changes: 0 additions & 30 deletions workflow/rules/star.smk
Original file line number Diff line number Diff line change
Expand Up @@ -51,33 +51,3 @@ rule star_pe_multi:
threads: 32
wrapper:
"v2.6.0/bio/star/align"
# shell:
# "STAR \
# --runThreadN {threads} \
# --genomeDir {input.idx} \
# --readFilesIn {input.fq1} {input.fq2} \
# --readFilesCommand gunzip -c \
# {params.extra} \
# --outReadsUnmapped Fastx \
# --outFileNamePrefix {params.outFileNamePrefix} \
# --outStd BAM_SortedByCoordinate \
# > {output.aln};"

# script:
# "../scripts/map_star.py"
# """
# STAR \
# --genomeDir $genomeDir \
# --readFilesCommand gunzip -c --readFilesIn $inFile1 $inFile2 \
# --runThreadN $nThreads \
# --chimSegmentMin $chimSegMin \
# --chimScoreMin 1 \
# --alignIntronMax $alignIntronMax \
# --outFilterMismatchNoverReadLmax $maxMismatchFraction \
# --alignTranscriptsPerReadNmax $alignTxPerReadMax \
# --twopassMode Basic \
# --outSAMtype BAM SortedByCoordinate \
# --chimOutType Junctions SeparateSAMold \
# --outFilterMultimapNmax 2 \
# --outFileNamePrefix $outPrefix
# """

0 comments on commit 279664d

Please sign in to comment.