diff --git a/R/rd.R b/R/rd.R new file mode 100644 index 0000000..9cf7276 --- /dev/null +++ b/R/rd.R @@ -0,0 +1,43 @@ +library(ggplot2) +library(scales) +library(gtable) +library(grid) + +chrNamesLong = c("chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20", "chr21", "chr22", "chrX") +chrNamesShort = c("1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","X") + +args = commandArgs(trailingOnly=TRUE) +x = read.table(args[1], header=T) +maxCN = 8 + +# Fix chromosome ordering +if (sum(x$chr %in% chrNamesLong) > sum(x$chr %in% chrNamesShort)) { chrs = chrNamesLong; } else { chrs = chrNamesShort; } +x = x[x$chr %in% chrs,] +x$chr = factor(x$chr, levels=chrs) + +# Whole genome +p = ggplot(data=x, aes(x=start, y=x[,6])) +p = p + geom_point(pch=21, size=0.5) +p = p + xlab("Chromosome") +p = p + ylab("Copy-number") +p = p + scale_x_continuous(labels=comma) +p = p + facet_grid(. ~ chr, scales="free_x", space="free_x") +p = p + ylim(0, maxCN) +p = p + theme(axis.text.x = element_text(angle=45, hjust=1)) +ggsave(p, file="plot.wholegenome.png", width=24, height=6) +print(warnings()) + +# By chromosome +for(chrname in unique(x$chr)) { + print(chrname) + sub = x[x$chr == chrname,] + p = ggplot(data=sub, aes(x=start, y=sub[,6])) + p = p + geom_point(pch=21, size=0.5) + p = p + ylab("Copy-number") + xlab(chrname) + p = p + scale_x_continuous(labels=comma, breaks = scales::pretty_breaks(n=20)) + p = p + ylim(0, maxCN) + p = p + theme(axis.text.x = element_text(angle=45, hjust=1)) + ggsave(p, file=paste0("plot.", chrname, ".png"), width=24, height=6) + print(warnings()) +} + diff --git a/README.md b/README.md index b34ecc9..f48dd7b 100644 --- a/README.md +++ b/README.md @@ -113,6 +113,24 @@ Germline SV calling `delly filter -f germline -o germline.bcf merged.bcf` + +Read-depth profiles +------------------- + +You can generate read-depth profiles with delly. This requires a mappability map which can be downloaded here: + +[Mappability Maps](https://gear.embl.de/data/delly/) + +The command to count reads in 10kbp windows and normalize the coverage is: + +`delly rd -a -g hg19.fa -m hg19.map input.bam` + +The output file can be plotted using R to generate normalized copy-number profiles: + +`Rscript R/rd.R out.cov.gz` + + + FAQ --- * What is the smallest SV size Delly can call? diff --git a/src/bed.h b/src/bed.h new file mode 100644 index 0000000..654ae4b --- /dev/null +++ b/src/bed.h @@ -0,0 +1,119 @@ +#ifndef BED_H +#define BED_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace torali +{ + + // Flattens overlapping intervals + template + inline int32_t + _parseBedIntervals(std::string const& filename, bool const filePresent, bam_hdr_t* hdr, TRegionsGenome& bedRegions) { + typedef typename TRegionsGenome::value_type TChrIntervals; + typedef typename TChrIntervals::interval_type TIVal; + + int32_t intervals = 0; + if (filePresent) { + bedRegions.resize(hdr->n_targets, TChrIntervals()); + std::ifstream chrFile(filename.c_str(), std::ifstream::in); + if (chrFile.is_open()) { + while (chrFile.good()) { + std::string chrFromFile; + getline(chrFile, chrFromFile); + typedef boost::tokenizer< boost::char_separator > Tokenizer; + boost::char_separator sep(" \t,;"); + Tokenizer tokens(chrFromFile, sep); + Tokenizer::iterator tokIter = tokens.begin(); + if (tokIter!=tokens.end()) { + std::string chrName = *tokIter++; + int32_t tid = bam_name2id(hdr, chrName.c_str()); + if (tid >= 0) { + if (tokIter!=tokens.end()) { + int32_t start = boost::lexical_cast(*tokIter++); + int32_t end = boost::lexical_cast(*tokIter++); + bedRegions[tid].insert(TIVal::right_open(start, end)); + ++intervals; + } + } + } + } + } + chrFile.close(); + } + return intervals; + } + + + // Keeps overlapping intervals + template + inline int32_t + _parsePotOverlappingIntervals(std::string const& filename, bool const filePresent, bam_hdr_t* hdr, TRegionsGenome& bedRegions) { + typedef typename TRegionsGenome::value_type TChrIntervals; + + int32_t intervals = 0; + if (filePresent) { + bedRegions.resize(hdr->n_targets, TChrIntervals()); + std::ifstream chrFile(filename.c_str(), std::ifstream::in); + if (chrFile.is_open()) { + while (chrFile.good()) { + std::string chrFromFile; + getline(chrFile, chrFromFile); + typedef boost::tokenizer< boost::char_separator > Tokenizer; + boost::char_separator sep(" \t,;"); + Tokenizer tokens(chrFromFile, sep); + Tokenizer::iterator tokIter = tokens.begin(); + if (tokIter!=tokens.end()) { + std::string chrName = *tokIter++; + int32_t tid = bam_name2id(hdr, chrName.c_str()); + if (tid >= 0) { + if (tokIter!=tokens.end()) { + int32_t start = boost::lexical_cast(*tokIter++); + int32_t end = boost::lexical_cast(*tokIter++); + bedRegions[tid].insert(std::make_pair(start, end)); + ++intervals; + } + } + } + } + } + chrFile.close(); + } + return intervals; + } + + + template + inline void + _mergeOverlappingBedEntries(TChrIntervals const& bedRegions, TChrIntervals& citv) { + typedef boost::icl::interval_set TUniqueIntervals; + typedef typename TUniqueIntervals::interval_type TIVal; + TUniqueIntervals uitv; + + // Insert intervals + for(typename TChrIntervals::const_iterator it = bedRegions.begin(); it != bedRegions.end(); ++it) uitv.insert(TIVal::right_open(it->first, it->second)); + + // Fetch unique intervals + for(typename TUniqueIntervals::iterator it = uitv.begin(); it != uitv.end(); ++it) citv.insert(std::make_pair(it->lower(), it->upper())); + } +} + +#endif diff --git a/src/coral.h b/src/coral.h new file mode 100644 index 0000000..8553e2f --- /dev/null +++ b/src/coral.h @@ -0,0 +1,648 @@ +#ifndef CORAL_H +#define CORAL_H + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "bed.h" +#include "scan.h" +#include "gcbias.h" +#include "version.h" + +namespace torali +{ + + struct CountDNAConfig { + bool adaptive; + bool hasStatsFile; + bool hasBedFile; + bool hasScanFile; + bool noScanWindowSelection; + uint32_t nchr; + uint32_t meanisize; + uint32_t window_size; + uint32_t window_offset; + uint32_t scanWindow; + uint32_t minChrLen; + uint16_t minQual; + uint16_t mad; + uint16_t ploidy; + float exclgc; + float uniqueToTotalCovRatio; + float fracWindow; + float fragmentUnique; + float controlMaf; + std::string sampleName; + boost::filesystem::path outfile; + boost::filesystem::path genome; + boost::filesystem::path statsFile; + boost::filesystem::path mapFile; + boost::filesystem::path bamFile; + boost::filesystem::path bedFile; + boost::filesystem::path scanFile; + }; + + struct CountDNAConfigLib { + uint16_t madCutoff; + uint16_t madNormalCutoff; + boost::filesystem::path genome; + std::vector files; + }; + + template + inline int32_t + bamCount(TConfig const& c, LibraryInfo const& li, std::vector const& gcbias, std::pair const& gcbound) { + // Load bam file + samFile* samfile = sam_open(c.bamFile.string().c_str(), "r"); + hts_set_fai_filename(samfile, c.genome.string().c_str()); + hts_idx_t* idx = sam_index_load(samfile, c.bamFile.string().c_str()); + bam_hdr_t* hdr = sam_hdr_read(samfile); + + // BED regions + typedef std::set > TChrIntervals; + typedef std::vector TRegionsGenome; + TRegionsGenome bedRegions; + if (c.hasBedFile) { + if (!_parsePotOverlappingIntervals(c.bedFile.string(), c.hasBedFile, hdr, bedRegions)) { + std::cerr << "Couldn't parse BED intervals. Do the chromosome names match?" << std::endl; + return 1; + } + } + + // Parse BAM file + boost::posix_time::ptime now = boost::posix_time::second_clock::local_time(); + std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "Count fragments" << std::endl; + boost::progress_display show_progress( hdr->n_targets ); + + // Open output files + boost::iostreams::filtering_ostream dataOut; + dataOut.push(boost::iostreams::gzip_compressor()); + dataOut.push(boost::iostreams::file_sink(c.outfile.c_str(), std::ios_base::out | std::ios_base::binary)); + dataOut << "chr\tstart\tend\t" << c.sampleName << "_mappable\t" << c.sampleName << "_counts\t" << c.sampleName << "_CN" << std::endl; + + // Iterate chromosomes + faidx_t* faiMap = fai_load(c.mapFile.string().c_str()); + faidx_t* faiRef = fai_load(c.genome.string().c_str()); + for(int32_t refIndex=0; refIndex < (int32_t) hdr->n_targets; ++refIndex) { + ++show_progress; + if (chrNoData(c, refIndex, idx)) continue; + + // Check presence in mappability map + std::string tname(hdr->target_name[refIndex]); + int32_t seqlen = faidx_seq_len(faiMap, tname.c_str()); + if (seqlen == - 1) continue; + else seqlen = -1; + char* seq = faidx_fetch_seq(faiMap, tname.c_str(), 0, faidx_seq_len(faiMap, tname.c_str()), &seqlen); + + // Check presence in reference + seqlen = faidx_seq_len(faiRef, tname.c_str()); + if (seqlen == - 1) continue; + else seqlen = -1; + char* ref = faidx_fetch_seq(faiRef, tname.c_str(), 0, faidx_seq_len(faiRef, tname.c_str()), &seqlen); + + // Get GC and Mappability + std::vector uniqContent(hdr->target_len[refIndex], 0); + std::vector gcContent(hdr->target_len[refIndex], 0); + { + // Mappability map + typedef boost::dynamic_bitset<> TBitSet; + TBitSet uniq(hdr->target_len[refIndex], false); + for(uint32_t i = 0; i < hdr->target_len[refIndex]; ++i) { + if (seq[i] == 'C') uniq[i] = 1; + } + + // GC map + typedef boost::dynamic_bitset<> TBitSet; + TBitSet gcref(hdr->target_len[refIndex], false); + for(uint32_t i = 0; i < hdr->target_len[refIndex]; ++i) { + if ((ref[i] == 'c') || (ref[i] == 'C') || (ref[i] == 'g') || (ref[i] == 'G')) gcref[i] = 1; + } + + // Sum across fragment + int32_t halfwin = (int32_t) (c.meanisize / 2); + int32_t usum = 0; + int32_t gcsum = 0; + for(int32_t pos = halfwin; pos < (int32_t) hdr->target_len[refIndex] - halfwin; ++pos) { + if (pos == halfwin) { + for(int32_t i = pos - halfwin; i<=pos+halfwin; ++i) { + usum += uniq[i]; + gcsum += gcref[i]; + } + } else { + usum -= uniq[pos - halfwin - 1]; + gcsum -= gcref[pos - halfwin - 1]; + usum += uniq[pos + halfwin]; + gcsum += gcref[pos + halfwin]; + } + gcContent[pos] = gcsum; + uniqContent[pos] = usum; + } + } + + // Coverage track + typedef uint16_t TCount; + uint32_t maxCoverage = std::numeric_limits::max(); + typedef std::vector TCoverage; + TCoverage cov(hdr->target_len[refIndex], 0); + + { + // Mate map + typedef boost::unordered_map TMateMap; + TMateMap mateMap; + + // Count reads + hts_itr_t* iter = sam_itr_queryi(idx, refIndex, 0, hdr->target_len[refIndex]); + bam1_t* rec = bam_init1(); + int32_t lastAlignedPos = 0; + std::set lastAlignedPosReads; + while (sam_itr_next(samfile, iter, rec) >= 0) { + if (rec->core.flag & (BAM_FQCFAIL | BAM_FDUP | BAM_FUNMAP | BAM_FSECONDARY | BAM_FSUPPLEMENTARY)) continue; + if (rec->core.qual < c.minQual) continue; + if ((rec->core.flag & BAM_FPAIRED) && ((rec->core.flag & BAM_FMUNMAP) || (rec->core.tid != rec->core.mtid))) continue; + + int32_t midPoint = rec->core.pos + halfAlignmentLength(rec); + if (rec->core.flag & BAM_FPAIRED) { + // Clean-up the read store for identical alignment positions + if (rec->core.pos > lastAlignedPos) { + lastAlignedPosReads.clear(); + lastAlignedPos = rec->core.pos; + } + + if ((rec->core.pos < rec->core.mpos) || ((rec->core.pos == rec->core.mpos) && (lastAlignedPosReads.find(hash_string(bam_get_qname(rec))) == lastAlignedPosReads.end()))) { + // First read + lastAlignedPosReads.insert(hash_string(bam_get_qname(rec))); + std::size_t hv = hash_pair(rec); + mateMap[hv] = true; + continue; + } else { + // Second read + std::size_t hv = hash_pair_mate(rec); + if ((mateMap.find(hv) == mateMap.end()) || (!mateMap[hv])) continue; // Mate discarded + mateMap[hv] = false; + } + + // update midpoint + int32_t isize = (rec->core.pos + alignmentLength(rec)) - rec->core.mpos; + if ((li.minNormalISize < isize) && (isize < li.maxNormalISize)) midPoint = rec->core.mpos + (int32_t) (isize/2); + } + + // Count fragment + if ((midPoint >= 0) && (midPoint < (int32_t) hdr->target_len[refIndex]) && (cov[midPoint] < maxCoverage - 1)) ++cov[midPoint]; + } + // Clean-up + if (seq != NULL) free(seq); + if (ref != NULL) free(ref); + bam_destroy1(rec); + hts_itr_destroy(iter); + } + + // BED File (target intervals) + if (c.hasBedFile) { + if (c.adaptive) { + // Merge overlapping BED entries + TChrIntervals citv; + _mergeOverlappingBedEntries(bedRegions[refIndex], citv); + + // Tile merged intervals + double covsum = 0; + double expcov = 0; + double obsexp = 0; + uint32_t winlen = 0; + uint32_t start = 0; + bool endOfWindow = true; + typename TChrIntervals::iterator it = citv.begin(); + if (it != citv.end()) start = it->first; + while(endOfWindow) { + endOfWindow = false; + for(it = citv.begin(); ((it != citv.end()) && (!endOfWindow)); ++it) { + if ((it->first < it->second) && (it->second <= hdr->target_len[refIndex])) { + if (start >= it->second) { + if (start == it->second) { + // Special case + typename TChrIntervals::iterator itNext = it; + ++itNext; + if (itNext != citv.end()) start = itNext->first; + } + continue; + } + for(uint32_t pos = it->first; ((pos < it->second) && (!endOfWindow)); ++pos) { + if (pos < start) continue; + if ((gcContent[pos] > gcbound.first) && (gcContent[pos] < gcbound.second) && (uniqContent[pos] >= c.fragmentUnique * c.meanisize)) { + covsum += cov[pos]; + obsexp += gcbias[gcContent[pos]].obsexp; + expcov += gcbias[gcContent[pos]].coverage; + ++winlen; + if (winlen == c.window_size) { + obsexp /= (double) winlen; + double count = ((double) covsum / obsexp ) * (double) c.window_size / (double) winlen; + double cn = c.ploidy * covsum / expcov; + dataOut << std::string(hdr->target_name[refIndex]) << "\t" << start << "\t" << (pos + 1) << "\t" << winlen << "\t" << count << "\t" << cn << std::endl; + // reset + covsum = 0; + expcov = 0; + obsexp = 0; + winlen = 0; + if (c.window_offset == c.window_size) { + // Move on + start = pos + 1; + endOfWindow = true; + } else { + // Rewind + for(typename TChrIntervals::iterator sit = citv.begin(); ((sit != citv.end()) && (!endOfWindow)); ++sit) { + if ((sit->first < sit->second) && (sit->second <= hdr->target_len[refIndex])) { + if (start >= sit->second) continue; + for(uint32_t k = sit->first; ((k < sit->second) && (!endOfWindow)); ++k) { + if (k < start) continue; + if ((gcContent[k] > gcbound.first) && (gcContent[k] < gcbound.second) && (uniqContent[k] >= c.fragmentUnique * c.meanisize)) { + ++winlen; + if (winlen == c.window_offset) { + start = k + 1; + winlen = 0; + endOfWindow = true; + } + } + } + } + } + } + } + } + } + } + } + } + } else { + // Fixed Window Length + for(typename TChrIntervals::iterator it = bedRegions[refIndex].begin(); it != bedRegions[refIndex].end(); ++it) { + if ((it->first < it->second) && (it->second <= hdr->target_len[refIndex])) { + double covsum = 0; + double expcov = 0; + double obsexp = 0; + uint32_t winlen = 0; + for(uint32_t pos = it->first; pos < it->second; ++pos) { + if ((gcContent[pos] > gcbound.first) && (gcContent[pos] < gcbound.second) && (uniqContent[pos] >= c.fragmentUnique * c.meanisize)) { + covsum += cov[pos]; + obsexp += gcbias[gcContent[pos]].obsexp; + expcov += gcbias[gcContent[pos]].coverage; + ++winlen; + } + } + if (winlen >= c.fracWindow * (it->second - it->first)) { + obsexp /= (double) winlen; + double count = ((double) covsum / obsexp ) * (double) (it->second - it->first) / (double) winlen; + double cn = c.ploidy * covsum / expcov; + dataOut << std::string(hdr->target_name[refIndex]) << "\t" << it->first << "\t" << it->second << "\t" << winlen << "\t" << count << "\t" << cn << std::endl; + } else { + dataOut << std::string(hdr->target_name[refIndex]) << "\t" << it->first << "\t" << it->second << "\tNA\tNA\tNA" << std::endl; + } + } + } + } + } else { + if (c.adaptive) { + double covsum = 0; + double expcov = 0; + double obsexp = 0; + uint32_t winlen = 0; + uint32_t start = 0; + uint32_t pos = 0; + while(pos < hdr->target_len[refIndex]) { + if ((gcContent[pos] > gcbound.first) && (gcContent[pos] < gcbound.second) && (uniqContent[pos] >= c.fragmentUnique * c.meanisize)) { + covsum += cov[pos]; + obsexp += gcbias[gcContent[pos]].obsexp; + expcov += gcbias[gcContent[pos]].coverage; + ++winlen; + if (winlen == c.window_size) { + obsexp /= (double) winlen; + double count = ((double) covsum / obsexp ) * (double) c.window_size / (double) winlen; + double cn = c.ploidy * covsum / expcov; + dataOut << std::string(hdr->target_name[refIndex]) << "\t" << start << "\t" << (pos + 1) << "\t" << winlen << "\t" << count << "\t" << cn << std::endl; + // reset + covsum = 0; + expcov = 0; + obsexp = 0; + winlen = 0; + if (c.window_offset == c.window_size) { + // Move on + start = pos + 1; + } else { + // Rewind + for(uint32_t k = start; k < hdr->target_len[refIndex]; ++k) { + if ((gcContent[k] > gcbound.first) && (gcContent[k] < gcbound.second) && (uniqContent[k] >= c.fragmentUnique * c.meanisize)) { + ++winlen; + if (winlen == c.window_offset) { + start = k + 1; + pos = k; + winlen = 0; + break; + } + } + } + } + } + } + ++pos; + } + } else { + // Fixed windows (genomic tiling) + for(uint32_t start = 0; start < hdr->target_len[refIndex]; start = start + c.window_offset) { + if (start + c.window_size < hdr->target_len[refIndex]) { + double covsum = 0; + double expcov = 0; + double obsexp = 0; + uint32_t winlen = 0; + for(uint32_t pos = start; pos < start + c.window_size; ++pos) { + if ((gcContent[pos] > gcbound.first) && (gcContent[pos] < gcbound.second) && (uniqContent[pos] >= c.fragmentUnique * c.meanisize)) { + covsum += cov[pos]; + obsexp += gcbias[gcContent[pos]].obsexp; + expcov += gcbias[gcContent[pos]].coverage; + ++winlen; + } + } + if (winlen >= c.fracWindow * c.window_size) { + obsexp /= (double) winlen; + double count = ((double) covsum / obsexp ) * (double) c.window_size / (double) winlen; + double cn = c.ploidy * covsum / expcov; + dataOut << std::string(hdr->target_name[refIndex]) << "\t" << start << "\t" << (start + c.window_size) << "\t" << winlen << "\t" << count << "\t" << cn << std::endl; + } + } + } + } + } + } + + // clean-up + fai_destroy(faiRef); + fai_destroy(faiMap); + bam_hdr_destroy(hdr); + hts_idx_destroy(idx); + sam_close(samfile); + dataOut.pop(); + dataOut.pop(); + return 0; + } + + + int coral(int argc, char **argv) { + CountDNAConfig c; + + // Parameter + boost::program_options::options_description generic("Generic options"); + generic.add_options() + ("help,?", "show help message") + ("genome,g", boost::program_options::value(&c.genome), "genome file") + ("quality,q", boost::program_options::value(&c.minQual)->default_value(10), "min. mapping quality") + ("mappability,m", boost::program_options::value(&c.mapFile), "input mappability map") + ("ploidy,y", boost::program_options::value(&c.ploidy)->default_value(2), "baseline ploidy") + ("fragment,e", boost::program_options::value(&c.fragmentUnique)->default_value(0.97), "min. fragment uniqueness [0,1]") + ("statsfile,s", boost::program_options::value(&c.statsFile), "gzipped stats output file (optional)") + ("outfile,o", boost::program_options::value(&c.outfile)->default_value("out.cov.gz"), "output file") + ("adaptive-windowing,a", "use mappable bases for window size") + ; + + boost::program_options::options_description window("Window options"); + window.add_options() + ("window-size,i", boost::program_options::value(&c.window_size)->default_value(10000), "window size") + ("window-offset,j", boost::program_options::value(&c.window_offset)->default_value(10000), "window offset") + ("bed-intervals,b", boost::program_options::value(&c.bedFile), "input BED file") + ("fraction-window,k", boost::program_options::value(&c.fracWindow)->default_value(0.25), "min. callable window fraction [0,1]") + ; + + boost::program_options::options_description gcopt("GC options"); + gcopt.add_options() + ("scan-window,w", boost::program_options::value(&c.scanWindow)->default_value(10000), "scanning window size") + ("fraction-unique,f", boost::program_options::value(&c.uniqueToTotalCovRatio)->default_value(0.8), "uniqueness filter for scan windows [0,1]") + ("scan-regions,r", boost::program_options::value(&c.scanFile), "scanning regions in BED format") + ("mad-cutoff,d", boost::program_options::value(&c.mad)->default_value(3), "median + 3 * mad count cutoff") + ("percentile,p", boost::program_options::value(&c.exclgc)->default_value(0.0005), "excl. extreme GC fraction") + ("no-window-selection,n", "no scan window selection") + ; + + boost::program_options::options_description hidden("Hidden options"); + hidden.add_options() + ("input-file", boost::program_options::value(&c.bamFile), "input bam file") + ; + + boost::program_options::positional_options_description pos_args; + pos_args.add("input-file", -1); + + // Set the visibility + boost::program_options::options_description cmdline_options; + cmdline_options.add(generic).add(window).add(gcopt).add(hidden); + boost::program_options::options_description visible_options; + visible_options.add(generic).add(window).add(gcopt); + + // Parse command-line + boost::program_options::variables_map vm; + boost::program_options::store(boost::program_options::command_line_parser(argc, argv).options(cmdline_options).positional(pos_args).run(), vm); + boost::program_options::notify(vm); + + // Check command line arguments + if ((vm.count("help")) || (!vm.count("input-file")) || (!vm.count("genome")) || (!vm.count("mappability"))) { + std::cout << std::endl; + std::cout << "Usage: delly " << argv[0] << " [OPTIONS] -g -m " << std::endl; + std::cout << visible_options << "\n"; + return 1; + } + + // Show cmd + boost::posix_time::ptime now = boost::posix_time::second_clock::local_time(); + std::cout << '[' << boost::posix_time::to_simple_string(now) << "] "; + std::cout << "delly "; + for(int i=0; i c.window_size) c.window_offset = c.window_size; + if (c.window_size == 0) c.window_size = 1; + if (c.window_offset == 0) c.window_offset = 1; + + // Check bam file + LibraryInfo li; + if (!(boost::filesystem::exists(c.bamFile) && boost::filesystem::is_regular_file(c.bamFile) && boost::filesystem::file_size(c.bamFile))) { + std::cerr << "Alignment file is missing: " << c.bamFile.string() << std::endl; + return 1; + } else { + // Get scan regions + typedef boost::icl::interval_set TChrIntervals; + typedef typename TChrIntervals::interval_type TIVal; + typedef std::vector TRegionsGenome; + TRegionsGenome scanRegions; + + // Open BAM file + samFile* samfile = sam_open(c.bamFile.string().c_str(), "r"); + if (samfile == NULL) { + std::cerr << "Fail to open file " << c.bamFile.string() << std::endl; + return 1; + } + hts_idx_t* idx = sam_index_load(samfile, c.bamFile.string().c_str()); + if (idx == NULL) { + if (bam_index_build(c.bamFile.string().c_str(), 0) != 0) { + std::cerr << "Fail to open index for " << c.bamFile.string() << std::endl; + return 1; + } + } + bam_hdr_t* hdr = sam_hdr_read(samfile); + if (hdr == NULL) { + std::cerr << "Fail to open header for " << c.bamFile.string() << std::endl; + return 1; + } + c.nchr = hdr->n_targets; + c.minChrLen = setMinChrLen(hdr, 0.95); + std::string sampleName = "unknown"; + getSMTag(std::string(hdr->text), c.bamFile.stem().string(), sampleName); + c.sampleName = sampleName; + + // Check matching chromosome names + faidx_t* faiRef = fai_load(c.genome.string().c_str()); + faidx_t* faiMap = fai_load(c.mapFile.string().c_str()); + uint32_t mapFound = 0; + uint32_t refFound = 0; + for(int32_t refIndex=0; refIndex < hdr->n_targets; ++refIndex) { + std::string tname(hdr->target_name[refIndex]); + if (faidx_has_seq(faiMap, tname.c_str())) ++mapFound; + if (faidx_has_seq(faiRef, tname.c_str())) ++refFound; + else { + std::cerr << "Warning: BAM chromosome " << tname << " not present in reference genome!" << std::endl; + } + } + fai_destroy(faiRef); + fai_destroy(faiMap); + if (!mapFound) { + std::cerr << "Mappability map chromosome naming disagrees with BAM file!" << std::endl; + return 1; + } + if (!refFound) { + std::cerr << "Reference genome chromosome naming disagrees with BAM file!" << std::endl; + return 1; + } + + // Estimate library params + if (c.hasScanFile) { + if (!_parseBedIntervals(c.scanFile.string(), c.hasScanFile, hdr, scanRegions)) { + std::cerr << "Warning: Couldn't parse BED intervals. Do the chromosome names match?" << std::endl; + return 1; + } + } else { + scanRegions.resize(hdr->n_targets); + for (int32_t refIndex = 0; refIndex < hdr->n_targets; ++refIndex) { + scanRegions[refIndex].insert(TIVal::right_open(0, hdr->target_len[refIndex])); + } + } + typedef std::vector TSampleLibrary; + TSampleLibrary sampleLib(1, LibraryInfo()); + CountDNAConfigLib dellyConf; + dellyConf.genome = c.genome; + dellyConf.files.push_back(c.bamFile); + dellyConf.madCutoff = 9; + dellyConf.madNormalCutoff = c.mad; + getLibraryParams(dellyConf, scanRegions, sampleLib); + li = sampleLib[0]; + if (!li.median) { + li.median = 250; + li.mad = 15; + li.minNormalISize = 0; + li.maxNormalISize = 400; + } + c.meanisize = ((int32_t) (li.median / 2)) * 2 + 1; + + // Clean-up + bam_hdr_destroy(hdr); + hts_idx_destroy(idx); + sam_close(samfile); + } + + // GC bias estimation + typedef std::pair TGCBound; + TGCBound gcbound; + std::vector gcbias(c.meanisize + 1, GcBias()); + { + // Scan genomic windows + typedef std::vector TWindowCounts; + typedef std::vector TGenomicWindowCounts; + TGenomicWindowCounts scanCounts(c.nchr, TWindowCounts()); + scan(c, li, scanCounts); + + // Select stable windows + selectWindows(c, scanCounts); + + // Estimate GC bias + gcBias(c, scanCounts, li, gcbias, gcbound); + + // Statistics output + if (c.hasStatsFile) { + // Open stats file + boost::iostreams::filtering_ostream statsOut; + statsOut.push(boost::iostreams::gzip_compressor()); + statsOut.push(boost::iostreams::file_sink(c.statsFile.string().c_str(), std::ios_base::out | std::ios_base::binary)); + + // Library Info + statsOut << "LP\t" << li.rs << ',' << li.median << ',' << li.mad << ',' << li.minNormalISize << ',' << li.maxNormalISize << std::endl; + + // Scan window summry + samFile* samfile = sam_open(c.bamFile.string().c_str(), "r"); + bam_hdr_t* hdr = sam_hdr_read(samfile); + statsOut << "SW\tchrom\tstart\tend\tselected\tcoverage\tuniqcov" << std::endl; + for(uint32_t refIndex = 0; refIndex < (uint32_t) hdr->n_targets; ++refIndex) { + for(uint32_t i = 0; i < scanCounts[refIndex].size(); ++i) { + statsOut << "SW\t" << hdr->target_name[refIndex] << '\t' << scanCounts[refIndex][i].start << '\t' << scanCounts[refIndex][i].end << '\t' << scanCounts[refIndex][i].select << '\t' << scanCounts[refIndex][i].cov << '\t' << scanCounts[refIndex][i].uniqcov << std::endl; + } + } + bam_hdr_destroy(hdr); + sam_close(samfile); + + // GC bias summary + statsOut << "GC\tgcsum\tsample\treference\tpercentileSample\tpercentileReference\tfractionSample\tfractionReference\tobsexp\tmeancoverage" << std::endl; + for(uint32_t i = 0; i < gcbias.size(); ++i) statsOut << "GC\t" << i << "\t" << gcbias[i].sample << "\t" << gcbias[i].reference << "\t" << gcbias[i].percentileSample << "\t" << gcbias[i].percentileReference << "\t" << gcbias[i].fractionSample << "\t" << gcbias[i].fractionReference << "\t" << gcbias[i].obsexp << "\t" << gcbias[i].coverage << std::endl; + statsOut << "BoundsGC\t" << gcbound.first << "," << gcbound.second << std::endl; + statsOut.pop(); + statsOut.pop(); + } + } + + // Count reads + if (bamCount(c, li, gcbias, gcbound)) { + std::cerr << "Read counting error!" << std::endl; + return 1; + } + + // Done + now = boost::posix_time::second_clock::local_time(); + std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "Done." << std::endl; + + return 0; + } + + +} + +#endif diff --git a/src/delly.cpp b/src/delly.cpp index 434c9cf..3c693df 100644 --- a/src/delly.cpp +++ b/src/delly.cpp @@ -18,6 +18,7 @@ #include "filter.h" #include "merge.h" #include "tegua.h" +#include "coral.h" using namespace torali; @@ -26,13 +27,17 @@ inline void displayUsage() { std::cout << "Usage: delly " << std::endl; std::cout << std::endl; - std::cout << "Commands:" << std::endl; - std::cout << std::endl; + std::cout << "Short-read commands:" << std::endl; std::cout << " call discover and genotype structural variants" << std::endl; - std::cout << " lr long-read SV discovery" << std::endl; std::cout << " merge merge structural variants across VCF/BCF files and within a single VCF/BCF file" << std::endl; std::cout << " filter filter somatic or germline structural variants" << std::endl; std::cout << std::endl; + std::cout << "Long-read commands:" << std::endl; + std::cout << " lr long-read SV discovery" << std::endl; + std::cout << std::endl; + std::cout << "Read-depth commands:" << std::endl; + std::cout << " rd read-depth normalization" << std::endl; + std::cout << std::endl; std::cout << std::endl; } @@ -68,6 +73,9 @@ int main(int argc, char **argv) { else if ((std::string(argv[1]) == "lr")) { return tegua(argc-1,argv+1); } + else if ((std::string(argv[1]) == "rd")) { + return coral(argc-1,argv+1); + } else if ((std::string(argv[1]) == "filter")) { return filter(argc-1,argv+1); } diff --git a/src/delly.h b/src/delly.h index dcc5106..f62d612 100644 --- a/src/delly.h +++ b/src/delly.h @@ -54,6 +54,7 @@ namespace torali uint16_t minTraQual; uint16_t minGenoQual; uint16_t madCutoff; + uint16_t madNormalCutoff; int32_t nchr; int32_t minimumFlankSize; int32_t indelsize; @@ -196,6 +197,7 @@ namespace torali int delly(int argc, char **argv) { Config c; c.isHaplotagged = false; + c.madNormalCutoff = 5; // Define generic options std::string svtype; diff --git a/src/gcbias.h b/src/gcbias.h new file mode 100644 index 0000000..0981210 --- /dev/null +++ b/src/gcbias.h @@ -0,0 +1,343 @@ +#ifndef GCBIAS_H +#define GCBIAS_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "scan.h" +#include "util.h" + +namespace torali +{ + struct GcBias { + int32_t sample; + int32_t reference; + double fractionSample; + double fractionReference; + double percentileSample; + double percentileReference; + double obsexp; + double coverage; + + GcBias() : sample(0), reference(0), fractionSample(0), fractionReference(0), percentileSample(0), percentileReference(0), obsexp(0), coverage(0) {} + }; + + template + inline std::pair + gcBound(TConfig const& c, std::vector& gcbias) { + uint32_t lowerBound = 0; + uint32_t upperBound = gcbias.size(); + for(uint32_t i = 0; i < gcbias.size(); ++i) { + if ((gcbias[i].percentileSample < c.exclgc) || (gcbias[i].percentileReference < c.exclgc)) lowerBound = i; + if ((gcbias[i].percentileSample + c.exclgc > 1) || (gcbias[i].percentileReference + c.exclgc > 1)) { + if (i < upperBound) upperBound = i; + } + } + if (lowerBound >= upperBound) upperBound = lowerBound + 1; + /* + // Adjust total + uint64_t totalSampleCount = 0; + uint64_t totalReferenceCount = 0; + for(uint32_t i = lowerBound + 1; i < upperBound; ++i) { + totalSampleCount += gcbias[i].sample; + totalReferenceCount += gcbias[i].reference; + } + // Re-estimate observed/expected + for(uint32_t i = lowerBound + 1; i < upperBound; ++i) { + gcbias[i].fractionSample = (double) gcbias[i].sample / (double) totalSampleCount; + gcbias[i].fractionReference = (double) gcbias[i].reference / (double) totalReferenceCount; + gcbias[i].obsexp = 1; + if (gcbias[i].fractionReference > 0) gcbias[i].obsexp = gcbias[i].fractionSample / gcbias[i].fractionReference; + } + */ + + return std::make_pair(lowerBound, upperBound); + } + + + inline double + getPercentIdentity(bam1_t const* rec, char const* seq) { + // Sequence + std::string sequence; + sequence.resize(rec->core.l_qseq); + uint8_t* seqptr = bam_get_seq(rec); + for (int i = 0; i < rec->core.l_qseq; ++i) sequence[i] = "=ACMGRSVTWYHKDBN"[bam_seqi(seqptr, i)]; + + // Reference slice + std::string refslice = boost::to_upper_copy(std::string(seq + rec->core.pos, seq + lastAlignedPosition(rec))); + + // Percent identity + uint32_t rp = 0; // reference pointer + uint32_t sp = 0; // sequence pointer + uint32_t* cigar = bam_get_cigar(rec); + int32_t matchCount = 0; + int32_t mismatchCount = 0; + for (std::size_t i = 0; i < rec->core.n_cigar; ++i) { + if ((bam_cigar_op(cigar[i]) == BAM_CMATCH) || (bam_cigar_op(cigar[i]) == BAM_CEQUAL) || (bam_cigar_op(cigar[i]) == BAM_CDIFF)) { + // match or mismatch + for(std::size_t k = 0; k 0) percid = (double) matchCount / (double) (matchCount + mismatchCount); + return percid; + } + + + + template + inline void + gcBias(TConfig const& c, std::vector< std::vector > const& scanCounts, LibraryInfo const& li, std::vector& gcbias, TGCBound& gcbound) { + // Load bam file + samFile* samfile = sam_open(c.bamFile.string().c_str(), "r"); + hts_set_fai_filename(samfile, c.genome.string().c_str()); + hts_idx_t* idx = sam_index_load(samfile, c.bamFile.string().c_str()); + bam_hdr_t* hdr = sam_hdr_read(samfile); + + // Parse bam (contig by contig) + boost::posix_time::ptime now = boost::posix_time::second_clock::local_time(); + std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "Estimate GC bias" << std::endl; + boost::progress_display show_progress( hdr->n_targets ); + + faidx_t* faiMap = fai_load(c.mapFile.string().c_str()); + faidx_t* faiRef = fai_load(c.genome.string().c_str()); + for (int refIndex = 0; refIndex < hdr->n_targets; ++refIndex) { + ++show_progress; + if (scanCounts[refIndex].empty()) continue; + + // Bin map + std::vector binMap; + if (c.hasScanFile) { + // Fill bin map + binMap.resize(hdr->target_len[refIndex], LAST_BIN); + for(uint32_t bin = 0;((bin < scanCounts[refIndex].size()) && (bin < LAST_BIN)); ++bin) { + for(int32_t k = scanCounts[refIndex][bin].start; k < scanCounts[refIndex][bin].end; ++k) binMap[k] = bin; + } + } + + // Check presence in mappability map + std::string tname(hdr->target_name[refIndex]); + int32_t seqlen = faidx_seq_len(faiMap, tname.c_str()); + if (seqlen == - 1) continue; + else seqlen = -1; + char* seq = faidx_fetch_seq(faiMap, tname.c_str(), 0, faidx_seq_len(faiMap, tname.c_str()), &seqlen); + + // Check presence in reference + seqlen = faidx_seq_len(faiRef, tname.c_str()); + if (seqlen == - 1) continue; + else seqlen = -1; + char* ref = faidx_fetch_seq(faiRef, tname.c_str(), 0, faidx_seq_len(faiRef, tname.c_str()), &seqlen); + + // Get GC and Mappability + std::vector uniqContent(hdr->target_len[refIndex], 0); + std::vector gcContent(hdr->target_len[refIndex], 0); + { + // Mappability map + typedef boost::dynamic_bitset<> TBitSet; + TBitSet uniq(hdr->target_len[refIndex], false); + for(uint32_t i = 0; i < hdr->target_len[refIndex]; ++i) { + if (seq[i] == 'C') uniq[i] = true; + } + + // GC map + typedef boost::dynamic_bitset<> TBitSet; + TBitSet gcref(hdr->target_len[refIndex], false); + for(uint32_t i = 0; i < hdr->target_len[refIndex]; ++i) { + if ((ref[i] == 'c') || (ref[i] == 'C') || (ref[i] == 'g') || (ref[i] == 'G')) gcref[i] = 1; + } + + // Sum across fragments + int32_t halfwin = (int32_t) (c.meanisize / 2); + int32_t usum = 0; + int32_t gcsum = 0; + for(int32_t pos = halfwin; pos < (int32_t) hdr->target_len[refIndex] - halfwin; ++pos) { + if (pos == halfwin) { + for(int32_t i = pos - halfwin; i<=pos+halfwin; ++i) { + usum += uniq[i]; + gcsum += gcref[i]; + } + } else { + usum -= uniq[pos - halfwin - 1]; + gcsum -= gcref[pos - halfwin - 1]; + usum += uniq[pos + halfwin]; + gcsum += gcref[pos + halfwin]; + } + gcContent[pos] = gcsum; + uniqContent[pos] = usum; + } + } + + // Coverage track + typedef uint16_t TCount; + uint32_t maxCoverage = std::numeric_limits::max(); + typedef std::vector TCoverage; + TCoverage cov(hdr->target_len[refIndex], 0); + + // Mate map + typedef boost::unordered_map TMateMap; + TMateMap mateMap; + + // Parse BAM + hts_itr_t* iter = sam_itr_queryi(idx, refIndex, 0, hdr->target_len[refIndex]); + bam1_t* rec = bam_init1(); + int32_t lastAlignedPos = 0; + std::set lastAlignedPosReads; + while (sam_itr_next(samfile, iter, rec) >= 0) { + if (rec->core.flag & (BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP | BAM_FSUPPLEMENTARY | BAM_FUNMAP)) continue; + if ((rec->core.flag & BAM_FPAIRED) && ((rec->core.flag & BAM_FMUNMAP) || (rec->core.tid != rec->core.mtid))) continue; + if (rec->core.qual < c.minQual) continue; + + int32_t midPoint = rec->core.pos + halfAlignmentLength(rec); + if (rec->core.flag & BAM_FPAIRED) { + // Clean-up the read store for identical alignment positions + if (rec->core.pos > lastAlignedPos) { + lastAlignedPosReads.clear(); + lastAlignedPos = rec->core.pos; + } + + // Process pair + if ((rec->core.pos < rec->core.mpos) || ((rec->core.pos == rec->core.mpos) && (lastAlignedPosReads.find(hash_string(bam_get_qname(rec))) == lastAlignedPosReads.end()))) { + // First read + lastAlignedPosReads.insert(hash_string(bam_get_qname(rec))); + std::size_t hv = hash_pair(rec); + mateMap[hv]= true; + continue; + } else { + // Second read + std::size_t hv = hash_pair_mate(rec); + if ((mateMap.find(hv) == mateMap.end()) || (!mateMap[hv])) continue; // Mate discarded + mateMap[hv] = false; + } + + // Insert size filter + int32_t isize = (rec->core.pos + alignmentLength(rec)) - rec->core.mpos; + if ((li.minNormalISize < isize) && (isize < li.maxNormalISize)) { + midPoint = rec->core.mpos + (int32_t) (isize/2); + } else { + if (rec->core.flag & BAM_FREVERSE) midPoint = rec->core.pos + alignmentLength(rec) - (c.meanisize / 2); + else midPoint = rec->core.pos + (c.meanisize / 2); + } + } + + // Count fragment + if ((midPoint >= 0) && (midPoint < (int32_t) hdr->target_len[refIndex]) && (cov[midPoint] < maxCoverage - 1)) ++cov[midPoint]; + } + bam_destroy1(rec); + hts_itr_destroy(iter); + if (seq != NULL) free(seq); + if (ref != NULL) free(ref); + + // Summarize GC coverage for this chromosome + for(uint32_t i = 0; i < hdr->target_len[refIndex]; ++i) { + if (uniqContent[i] >= c.fragmentUnique * c.meanisize) { + // Valid bin? + int32_t bin = _findScanWindow(c, hdr->target_len[refIndex], binMap, i); + if ((bin >= 0) && (scanCounts[refIndex][bin].select)) { + ++gcbias[gcContent[i]].reference; + gcbias[gcContent[i]].sample += cov[i]; + gcbias[gcContent[i]].coverage += cov[i]; + } + } + } + } + + // Normalize GC coverage + for(uint32_t i = 0; i < gcbias.size(); ++i) { + if (gcbias[i].reference) gcbias[i].coverage /= (double) gcbias[i].reference; + else gcbias[i].coverage = 0; + } + + // Determine percentiles + uint64_t totalSampleCount = 0; + uint64_t totalReferenceCount = 0; + for(uint32_t i = 0; i < gcbias.size(); ++i) { + totalSampleCount += gcbias[i].sample; + totalReferenceCount += gcbias[i].reference; + } + uint64_t cumSample = 0; + uint64_t cumReference = 0; + for(uint32_t i = 0; i < gcbias.size(); ++i) { + cumSample += gcbias[i].sample; + cumReference += gcbias[i].reference; + gcbias[i].fractionSample = (double) gcbias[i].sample / (double) totalSampleCount; + gcbias[i].fractionReference = (double) gcbias[i].reference / (double) totalReferenceCount; + gcbias[i].percentileSample = (double) cumSample / (double) totalSampleCount; + gcbias[i].percentileReference = (double) cumReference / (double) totalReferenceCount; + gcbias[i].obsexp = 1; + if (gcbias[i].fractionReference > 0) gcbias[i].obsexp = gcbias[i].fractionSample / gcbias[i].fractionReference; + } + + // Estimate correctable GC range + gcbound = gcBound(c, gcbias); + + // Adjust correction to the callable range + totalSampleCount = 0; + totalReferenceCount = 0; + for(uint32_t i = gcbound.first + 1; i < gcbound.second; ++i) { + totalSampleCount += gcbias[i].sample; + totalReferenceCount += gcbias[i].reference; + } + cumSample = 0; + cumReference = 0; + // Re-initialize + for(uint32_t i = 0; i < gcbias.size(); ++i) { + gcbias[i].fractionSample = 0; + gcbias[i].fractionReference = 0; + gcbias[i].percentileSample = 0; + gcbias[i].percentileReference = 0; + gcbias[i].obsexp = 1; + } + for(uint32_t i = gcbound.first + 1; i < gcbound.second; ++i) { + cumSample += gcbias[i].sample; + cumReference += gcbias[i].reference; + gcbias[i].fractionSample = (double) gcbias[i].sample / (double) totalSampleCount; + gcbias[i].fractionReference = (double) gcbias[i].reference / (double) totalReferenceCount; + gcbias[i].percentileSample = (double) cumSample / (double) totalSampleCount; + gcbias[i].percentileReference = (double) cumReference / (double) totalReferenceCount; + gcbias[i].obsexp = 1; + if (gcbias[i].fractionReference > 0) gcbias[i].obsexp = gcbias[i].fractionSample / gcbias[i].fractionReference; + } + + fai_destroy(faiRef); + fai_destroy(faiMap); + hts_idx_destroy(idx); + sam_close(samfile); + bam_hdr_destroy(hdr); + } + +} + +#endif diff --git a/src/scan.h b/src/scan.h new file mode 100644 index 0000000..2e96514 --- /dev/null +++ b/src/scan.h @@ -0,0 +1,298 @@ +#ifndef SCAN_H +#define SCAN_H + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include "version.h" +#include "util.h" + + +namespace torali +{ + + struct ScanWindow { + bool select; + int32_t start; + int32_t end; + uint32_t cov; + uint32_t uniqcov; + + ScanWindow() : select(false), start(0), end(0), cov(0), uniqcov(0) {} + explicit ScanWindow(int32_t const s) : select(false), start(s), end(s+1), cov(0), uniqcov(0) {} + }; + + template + struct SortScanWindow : public std::binary_function + { + inline bool operator()(TScanWindow const& sw1, TScanWindow const& sw2) { + return ((sw1.start + inline int32_t + _findScanWindow(TConfig const& c, uint32_t const reflen, std::vector const& binMap, int32_t const midPoint) { + if (c.hasScanFile) { + if (binMap[midPoint] == LAST_BIN) return -1; + else return binMap[midPoint]; + } else { + uint32_t bin = midPoint / c.scanWindow; + uint32_t allbins = reflen / c.scanWindow; + if (bin >= allbins) return -1; + else return bin; + } + return -1; + } + + + inline std::pair + estCountBounds(std::vector< std::vector > const& scanCounts) { + std::vector all; + for(uint32_t refIndex = 0; refIndex < scanCounts.size(); ++refIndex) { + for(uint32_t i = 0; i absdev; + for(uint32_t i = 0; i + inline void + scan(TConfig const& c, LibraryInfo const& li, std::vector< std::vector >& scanCounts) { + + // Load bam file + samFile* samfile = sam_open(c.bamFile.string().c_str(), "r"); + hts_set_fai_filename(samfile, c.genome.string().c_str()); + hts_idx_t* idx = sam_index_load(samfile, c.bamFile.string().c_str()); + bam_hdr_t* hdr = sam_hdr_read(samfile); + + // Pre-defined scanning windows + if (c.hasScanFile) { + typedef boost::icl::interval_set TChrIntervals; + typedef std::vector TRegionsGenome; + TRegionsGenome scanRegions; + if (!_parseBedIntervals(c.scanFile.string(), c.hasScanFile, hdr, scanRegions)) { + std::cerr << "Warning: Couldn't parse BED intervals. Do the chromosome names match?" << std::endl; + } + for (int32_t refIndex = 0; refIndex < hdr->n_targets; ++refIndex) { + for(typename TChrIntervals::iterator it = scanRegions[refIndex].begin(); it != scanRegions[refIndex].end(); ++it) { + if (it->lower() < it->upper()) { + if ((it->lower() >= 0) && (it->upper() < hdr->target_len[refIndex])) { + ScanWindow sw; + sw.start = it->lower(); + sw.end = it->upper(); + sw.select = true; + scanCounts[refIndex].push_back(sw); + } + } + } + // Sort scan windows + sort(scanCounts[refIndex].begin(), scanCounts[refIndex].end(), SortScanWindow()); + } + } + + // Parse BAM file + boost::posix_time::ptime now = boost::posix_time::second_clock::local_time(); + std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "Scanning Windows" << std::endl; + boost::progress_display show_progress( hdr->n_targets ); + + // Iterate chromosomes + uint64_t totalCov = 0; + faidx_t* faiMap = fai_load(c.mapFile.string().c_str()); + for(int32_t refIndex=0; refIndex < (int32_t) hdr->n_targets; ++refIndex) { + ++show_progress; + if (chrNoData(c, refIndex, idx)) continue; + // Exclude small chromosomes + if ((hdr->target_len[refIndex] < c.minChrLen) && (totalCov > 1000000)) continue; + // Exclude sex chromosomes + if ((std::string(hdr->target_name[refIndex]) == "chrX") || (std::string(hdr->target_name[refIndex]) == "chrY") || (std::string(hdr->target_name[refIndex]) == "X") || (std::string(hdr->target_name[refIndex]) == "Y")) continue; + + // Check presence in mappability map + std::string tname(hdr->target_name[refIndex]); + int32_t seqlen = faidx_seq_len(faiMap, tname.c_str()); + if (seqlen == -1) continue; + else seqlen = -1; + char* seq = faidx_fetch_seq(faiMap, tname.c_str(), 0, faidx_seq_len(faiMap, tname.c_str()), &seqlen); + + // Get Mappability + std::vector uniqContent(hdr->target_len[refIndex], 0); + { + // Mappability map + typedef boost::dynamic_bitset<> TBitSet; + TBitSet uniq(hdr->target_len[refIndex], false); + for(uint32_t i = 0; i < hdr->target_len[refIndex]; ++i) { + if (seq[i] == 'C') uniq[i] = 1; + } + + // Sum across fragments + int32_t halfwin = (int32_t) (c.meanisize / 2); + int32_t usum = 0; + for(int32_t pos = halfwin; pos < (int32_t) hdr->target_len[refIndex] - halfwin; ++pos) { + if (pos == halfwin) { + for(int32_t i = pos - halfwin; i<=pos+halfwin; ++i) usum += uniq[i]; + } else { + usum -= uniq[pos - halfwin - 1]; + usum += uniq[pos + halfwin]; + } + uniqContent[pos] = usum; + } + } + + // Bins on this chromosome + std::vector binMap; + if (!c.hasScanFile) { + uint32_t allbins = hdr->target_len[refIndex] / c.scanWindow; + scanCounts[refIndex].resize(allbins, ScanWindow()); + for(uint32_t i = 0; i < allbins; ++i) { + scanCounts[refIndex][i].start = i * c.scanWindow; + scanCounts[refIndex][i].end = (i+1) * c.scanWindow; + } + } else { + // Fill bin map + binMap.resize(hdr->target_len[refIndex], LAST_BIN); + if (scanCounts[refIndex].size() >= LAST_BIN) { + std::cerr << "Warning: Too many scan windows on " << hdr->target_name[refIndex] << std::endl; + } + for(uint32_t bin = 0;((bin < scanCounts[refIndex].size()) && (bin < LAST_BIN)); ++bin) { + for(int32_t k = scanCounts[refIndex][bin].start; k < scanCounts[refIndex][bin].end; ++k) binMap[k] = bin; + } + } + + // Mate map + typedef boost::unordered_map TMateMap; + TMateMap mateMap; + + // Count reads + hts_itr_t* iter = sam_itr_queryi(idx, refIndex, 0, hdr->target_len[refIndex]); + bam1_t* rec = bam_init1(); + int32_t lastAlignedPos = 0; + std::set lastAlignedPosReads; + while (sam_itr_next(samfile, iter, rec) >= 0) { + if (rec->core.flag & (BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP | BAM_FSUPPLEMENTARY | BAM_FUNMAP)) continue; + if ((rec->core.flag & BAM_FPAIRED) && ((rec->core.flag & BAM_FMUNMAP) || (rec->core.tid != rec->core.mtid))) continue; + if (rec->core.qual < c.minQual) continue; + if (getSVType(rec->core) != 2) continue; + + int32_t midPoint = rec->core.pos + halfAlignmentLength(rec); + if (rec->core.flag & BAM_FPAIRED) { + // Clean-up the read store for identical alignment positions + if (rec->core.pos > lastAlignedPos) { + lastAlignedPosReads.clear(); + lastAlignedPos = rec->core.pos; + } + + if ((rec->core.pos < rec->core.mpos) || ((rec->core.pos == rec->core.mpos) && (lastAlignedPosReads.find(hash_string(bam_get_qname(rec))) == lastAlignedPosReads.end()))) { + // First read + lastAlignedPosReads.insert(hash_string(bam_get_qname(rec))); + std::size_t hv = hash_pair(rec); + mateMap[hv] = true; + continue; + } else { + // Second read + std::size_t hv = hash_pair_mate(rec); + if ((mateMap.find(hv) == mateMap.end()) || (!mateMap[hv])) continue; // Mate discarded + mateMap[hv] = false; + } + + // Insert size filter + int32_t isize = (rec->core.pos + alignmentLength(rec)) - rec->core.mpos; + if ((li.minNormalISize < isize) && (isize < li.maxNormalISize)) midPoint = rec->core.mpos + (int32_t) (isize/2); + else continue; + } + + // Count fragment + if ((midPoint >= 0) && (midPoint < (int32_t) hdr->target_len[refIndex])) { + int32_t bin = _findScanWindow(c, hdr->target_len[refIndex], binMap, midPoint); + if (bin >= 0) { + ++scanCounts[refIndex][bin].cov; + + if (uniqContent[midPoint] >= c.fragmentUnique * c.meanisize) ++scanCounts[refIndex][bin].uniqcov; + ++totalCov; + } + } + } + // Clean-up + bam_destroy1(rec); + hts_itr_destroy(iter); + if (seq != NULL) free(seq); + } + + // clean-up + fai_destroy(faiMap); + bam_hdr_destroy(hdr); + hts_idx_destroy(idx); + sam_close(samfile); + } + + + template + inline void + selectWindows(TConfig const& c, std::vector< std::vector >& scanCounts) { + if (c.noScanWindowSelection) { + // Select all windows + for(uint32_t refIndex = 0; refIndex < scanCounts.size(); ++refIndex) { + for(uint32_t i = 0; i 0) uniqratio = (double) scanCounts[refIndex][i].uniqcov / scanCounts[refIndex][i].cov; + if (uniqratio > c.uniqueToTotalCovRatio) scanCounts[refIndex][i].select = true; + else scanCounts[refIndex][i].select = false; + } + } + + // Normalize user-defined scan windows to same length (10,000bp) + if (c.hasScanFile) { + for(uint32_t refIndex = 0; refIndex < scanCounts.size(); ++refIndex) { + for(uint32_t i = 0; i TCountBounds; + TCountBounds cb = estCountBounds(scanCounts); + + // Select CN2 windows + for(uint32_t refIndex = 0; refIndex < scanCounts.size(); ++refIndex) { + for(uint32_t i = 0; i cb.first) && (scanCounts[refIndex][i].cov < cb.second)) scanCounts[refIndex][i].select = true; + else scanCounts[refIndex][i].select = false; + } + } + } + } + } + +} + +#endif diff --git a/src/util.h b/src/util.h index 689a26c..7cc78fd 100644 --- a/src/util.h +++ b/src/util.h @@ -16,6 +16,10 @@ namespace torali { + #ifndef LAST_BIN + #define LAST_BIN 65535 + #endif + struct LibraryInfo { int32_t rs; int32_t median; @@ -41,7 +45,13 @@ namespace torali }; - + inline bool + nContent(std::string const& s) { + for(uint32_t i = 0; i < s.size(); ++i) { + if ((s[i] == 'N') || (s[i] == 'n')) return true; + } + return false; + } // Decode Orientation inline int32_t @@ -199,7 +209,7 @@ namespace torali return sequenceLength(rec); } - inline uint32_t alignmentLength(bam1_t* rec) { + inline uint32_t alignmentLength(bam1_t const* rec) { uint32_t* cigar = bam_get_cigar(rec); uint32_t alen = 0; for (uint32_t i = 0; i < rec->core.n_cigar; ++i) @@ -207,10 +217,17 @@ namespace torali return alen; } - inline uint32_t halfAlignmentLength(bam1_t* rec) { + inline uint32_t halfAlignmentLength(bam1_t const* rec) { return (alignmentLength(rec) / 2); } + inline uint32_t + lastAlignedPosition(bam1_t const* rec) { + return rec->core.pos + alignmentLength(rec); + } + + + inline std::size_t hash_lr(bam1_t* rec) { boost::hash string_hash; std::string qname = bam_get_qname(rec); @@ -293,6 +310,47 @@ namespace torali } + inline uint32_t + setMinChrLen(bam_hdr_t const* hdr, double const xx) { + uint32_t minChrLen = 0; + std::vector chrlen(hdr->n_targets, 0); + uint64_t genomelen = 0; + for(int32_t refIndex = 0; refIndex < hdr->n_targets; ++refIndex) { + chrlen[refIndex] = hdr->target_len[refIndex]; + genomelen += hdr->target_len[refIndex]; + } + std::sort(chrlen.begin(), chrlen.end(), std::greater()); + uint64_t cumsum = 0; + for(uint32_t i = 0; i < chrlen.size(); ++i) { + cumsum += chrlen[i]; + minChrLen = chrlen[i]; + if (cumsum > genomelen * xx) break; + } + return minChrLen; + } + + + template + inline bool + chrNoData(TConfig const& c, uint32_t const refIndex, hts_idx_t const* idx) { + // Check we have mapped reads on this chromosome + std::string suffix("cram"); + std::string str(c.bamFile.string()); + if ((str.size() >= suffix.size()) && (str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0)) return false; + uint64_t mapped = 0; + uint64_t unmapped = 0; + hts_idx_get_stat(idx, refIndex, &mapped, &unmapped); + if (mapped) return false; + else return true; + } + + inline std::size_t hash_se(bam1_t* rec) { + std::size_t seed = hash_string(bam_get_qname(rec)); + boost::hash_combine(seed, rec->core.tid); + boost::hash_combine(seed, rec->core.pos); + return seed; + } + inline void getSMTag(std::string const& header, std::string const& fileName, std::string& sampleName) { std::set smIdentifiers; @@ -564,8 +622,8 @@ namespace torali } else { sampleLib[file_c].median = median; sampleLib[file_c].mad = mad; - sampleLib[file_c].maxNormalISize = median + (5 * mad); - sampleLib[file_c].minNormalISize = median - (5 * mad); + sampleLib[file_c].maxNormalISize = median + (c.madNormalCutoff * mad); + sampleLib[file_c].minNormalISize = median - (c.madNormalCutoff * mad); if (sampleLib[file_c].minNormalISize < 0) sampleLib[file_c].minNormalISize=0; sampleLib[file_c].maxISizeCutoff = median + (c.madCutoff * mad); sampleLib[file_c].minISizeCutoff = median - (c.madCutoff * mad);