read-depth profiles

lindenb · Jul 20, 2020 · 9dd7788 · 9dd7788
1 parent 1ee020e
commit 9dd7788
Show file tree

Hide file tree

Showing 9 changed files with 1,545 additions and 8 deletions.
diff --git a/R/rd.R b/R/rd.R
@@ -0,0 +1,43 @@
+library(ggplot2)
+library(scales)
+library(gtable)
+library(grid)
+
+chrNamesLong = c("chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20", "chr21", "chr22", "chrX")
+chrNamesShort = c("1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","X")
+
+args = commandArgs(trailingOnly=TRUE)
+x = read.table(args[1], header=T)
+maxCN = 8
+
+# Fix chromosome ordering
+if (sum(x$chr %in% chrNamesLong) > sum(x$chr %in% chrNamesShort)) { chrs = chrNamesLong; } else { chrs = chrNamesShort; }
+x = x[x$chr %in% chrs,]
+x$chr = factor(x$chr, levels=chrs)
+
+# Whole genome
+p = ggplot(data=x, aes(x=start, y=x[,6]))
+p = p + geom_point(pch=21, size=0.5)
+p = p + xlab("Chromosome")
+p = p + ylab("Copy-number")
+p = p + scale_x_continuous(labels=comma)
+p = p + facet_grid(. ~ chr, scales="free_x", space="free_x")
+p = p + ylim(0, maxCN)
+p = p + theme(axis.text.x = element_text(angle=45, hjust=1))
+ggsave(p, file="plot.wholegenome.png", width=24, height=6)
+print(warnings())
+
+# By chromosome
+for(chrname in unique(x$chr)) {
+ print(chrname)
+ sub = x[x$chr == chrname,]
+ p = ggplot(data=sub, aes(x=start, y=sub[,6]))
+ p = p + geom_point(pch=21, size=0.5)
+ p = p + ylab("Copy-number") + xlab(chrname)
+ p = p + scale_x_continuous(labels=comma, breaks = scales::pretty_breaks(n=20))
+ p = p + ylim(0, maxCN)
+ p = p + theme(axis.text.x = element_text(angle=45, hjust=1))
+ ggsave(p, file=paste0("plot.", chrname, ".png"), width=24, height=6)
+ print(warnings())
+}
+
diff --git a/README.md b/README.md
@@ -113,6 +113,24 @@ Germline SV calling
 
 `delly filter -f germline -o germline.bcf merged.bcf`
 
+
+Read-depth profiles
+-------------------
+
+You can generate read-depth profiles with delly. This requires a mappability map which can be downloaded here:
+
+[Mappability Maps](https://gear.embl.de/data/delly/)
+
+The command to count reads in 10kbp windows and normalize the coverage is:
+
+`delly rd -a -g hg19.fa -m hg19.map input.bam`
+
+The output file can be plotted using R to generate normalized copy-number profiles:
+
+`Rscript R/rd.R out.cov.gz`
+
+
+
 FAQ
 ---
 * What is the smallest SV size Delly can call?  

diff --git a/src/bed.h b/src/bed.h
@@ -0,0 +1,119 @@
+#ifndef BED_H
+#define BED_H
+
+#include <boost/filesystem.hpp>
+#include <boost/multi_array.hpp>
+#include <boost/progress.hpp>
+#include <boost/date_time/posix_time/posix_time.hpp>
+#include <boost/unordered_map.hpp>
+#include <boost/algorithm/string.hpp>
+#include <boost/tokenizer.hpp>
+#include <boost/math/distributions/chi_squared.hpp>
+#include <boost/math/distributions/hypergeometric.hpp>
+#include <boost/iostreams/filtering_streambuf.hpp>
+#include <boost/iostreams/filtering_stream.hpp>
+#include <boost/iostreams/copy.hpp>
+#include <boost/iostreams/filter/gzip.hpp>
+#include <boost/iostreams/device/file.hpp>
+#include <boost/iostreams/filter/zlib.hpp>
+
+#include <htslib/faidx.h>
+#include <htslib/vcf.h>
+#include <htslib/sam.h>
+
+namespace torali
+{
+
+  // Flattens overlapping intervals
+  template<typename TRegionsGenome>
+  inline int32_t
+    _parseBedIntervals(std::string const& filename, bool const filePresent, bam_hdr_t* hdr, TRegionsGenome& bedRegions) {
+    typedef typename TRegionsGenome::value_type TChrIntervals;
+    typedef typename TChrIntervals::interval_type TIVal;
+
+    int32_t intervals = 0;
+    if (filePresent) {
+      bedRegions.resize(hdr->n_targets, TChrIntervals());
+      std::ifstream chrFile(filename.c_str(), std::ifstream::in);
+      if (chrFile.is_open()) {
+	while (chrFile.good()) {
+	  std::string chrFromFile;
+	  getline(chrFile, chrFromFile);
+	  typedef boost::tokenizer< boost::char_separator<char> > Tokenizer;
+	  boost::char_separator<char> sep(" \t,;");
+	  Tokenizer tokens(chrFromFile, sep);
+	  Tokenizer::iterator tokIter = tokens.begin();
+	  if (tokIter!=tokens.end()) {
+	    std::string chrName = *tokIter++;
+	    int32_t tid = bam_name2id(hdr, chrName.c_str());
+	    if (tid >= 0) {
+	      if (tokIter!=tokens.end()) {
+		int32_t start = boost::lexical_cast<int32_t>(*tokIter++);
+		int32_t end = boost::lexical_cast<int32_t>(*tokIter++);
+		bedRegions[tid].insert(TIVal::right_open(start, end));
+		++intervals;
+	      }
+	    }
+	  }
+	}
+      }
+      chrFile.close();
+    }
+    return intervals;
+  }
+
+
+  // Keeps overlapping intervals
+  template<typename TRegionsGenome>
+  inline int32_t
+  _parsePotOverlappingIntervals(std::string const& filename, bool const filePresent, bam_hdr_t* hdr, TRegionsGenome& bedRegions) {
+    typedef typename TRegionsGenome::value_type TChrIntervals;
+
+    int32_t intervals = 0;
+    if (filePresent) {
+      bedRegions.resize(hdr->n_targets, TChrIntervals());
+      std::ifstream chrFile(filename.c_str(), std::ifstream::in);
+      if (chrFile.is_open()) {
+	while (chrFile.good()) {
+	  std::string chrFromFile;
+	  getline(chrFile, chrFromFile);
+	  typedef boost::tokenizer< boost::char_separator<char> > Tokenizer;
+	  boost::char_separator<char> sep(" \t,;");
+	  Tokenizer tokens(chrFromFile, sep);
+	  Tokenizer::iterator tokIter = tokens.begin();
+	  if (tokIter!=tokens.end()) {
+	    std::string chrName = *tokIter++;
+	    int32_t tid = bam_name2id(hdr, chrName.c_str());
+	    if (tid >= 0) {
+	      if (tokIter!=tokens.end()) {
+		int32_t start = boost::lexical_cast<int32_t>(*tokIter++);
+		int32_t end = boost::lexical_cast<int32_t>(*tokIter++);
+		bedRegions[tid].insert(std::make_pair(start, end));
+		++intervals;
+	      }
+	    }
+	  }
+	}
+      }
+      chrFile.close();
+    }
+    return intervals;
+  }
+
+
+  template<typename TChrIntervals>
+  inline void
+  _mergeOverlappingBedEntries(TChrIntervals const& bedRegions, TChrIntervals& citv) {
+    typedef boost::icl::interval_set<uint32_t> TUniqueIntervals;
+    typedef typename TUniqueIntervals::interval_type TIVal;
+    TUniqueIntervals uitv;
+
+    // Insert intervals
+    for(typename TChrIntervals::const_iterator it = bedRegions.begin(); it != bedRegions.end(); ++it) uitv.insert(TIVal::right_open(it->first, it->second));
+
+    // Fetch unique intervals
+    for(typename TUniqueIntervals::iterator it = uitv.begin(); it != uitv.end(); ++it) citv.insert(std::make_pair(it->lower(), it->upper()));
+  }
+}
+
+#endif