Download 1000G phase1 mtDNAs result and R script used for extracting variants from VCF file into MitoTool/MitoToolPy input file.


Notes: This demo including R script is available for both rCRS and RSRS as human mtDNAs reference. And it should be noticed that the reference mtDNA (rCRS or RSRS) used for reads mapping should be consistent with haplogrouping system of MitoTool (i.e., mtDNA tree based on rCRS or RSRS).


step 1: Install dependant R package (Tips: skip this step when this package has been installed)
source("https://bioconductor.org/biocLite.R")
biocLite("VariantAnnotation")


step 2: Download 1000G phase1 mtDNA VCF file and its tbi file (Tips: skip this step when using local VCF and tbi files)
# rCRS is reference mtDNA during reads mapping and hg19 is the version of human reference genome in 1000 Genome Project.
url="http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase1/analysis_results/integrated_call_sets/ALL.chrMT.phase1_samtools_si.20101123.snps.low_coverage.genotypes.vcf.gz"
destFile=paste(getwd(),"/1000G_mtDNA.vcf.gz",sep="")
download.file(url, destFile)

tbiUrl="http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase1/analysis_results/integrated_call_sets/ALL.chrMT.phase1_samtools_si.20101123.snps.low_coverage.genotypes.vcf.gz.tbi"
tbiFile=paste(getwd(),"/1000G_mtDNA.vcf.gz.tbi",sep="")
download.file(tbiUrl, tbiFile)


step 3: Extract 1000G phase1 mtDNA variants from VCF file and export variants into a fasta-like file as MitoTool input file (Tips: for analyzing local VCF file, just replace destFile below to the path of your VCF file, e.g., “/home/john/sample1.vcf.gz”)
library(VariantAnnotation)

# Use bgzip and tabix of samtools to preprocess your vcf file. 
# bgzip -c <your_file_name>.vcf > <your_file_name>.vcf.gz
# tabix -p vcf <your_file_name>.vcf.gz

# To modify "destFile" to set the path of your own local VCF files which are generated by other software such as GATK, samtools and VcfTools, e.g., "/home/john/sample1.vcf.gz". The corresponding tbi file (i.e., /home/john/sample1.vcf.gz.tbi) is required and tabix of samtools could be used to generate tbi file. 
vcfFile = destFile

# "MT" is the name of reference mtDNA used for reads mapping, so in your VCF file, it may be "MT", "M", "rCRS", "RSRS" and others, please revise it according to your own reference mtDNA and VCF file. Meanwhile, 16569 is the length of rCRS, it should be the same as the length of your own reference mtDNA.
param = ScanVcfParam(geno="GT", which=GRanges("MT", IRanges(1, 16569)))
vcf = readVcf(vcfFile, "hg19", param=param)
sample_id=colnames(geno(vcf)$GT)

# Users can modify this R script for filtering some low-quality variants using score and/or pass.
score=fixed(vcf)$QUAL
pass=fixed(vcf)$FILTER

function1 <- function(i,alt,current_variants, ref, position){
  if(current_variants[i]!="."){
    current_alt=as.character(alt[[i]][strtoi(current_variants[i])])
    current_ref=as.character(ref[i])
    current_position=position[i]
    if( nchar(current_ref)>1 & nchar(current_alt)==nchar(current_ref) ){
      current_alt=strtrim(current_alt,1)
      current_ref=strtrim(current_ref,1)
    }
    if(current_ref=="A" & current_alt=="G"){current_ref=""; current_alt=""}
    else if(current_ref=="G" & current_alt=="A"){current_ref=""; current_alt=""}
    else if(current_ref=="C" & current_alt=="T"){current_ref=""; current_alt=""}
    else if(current_ref=="T" & current_alt=="C"){current_ref=""; current_alt=""}
    else if(current_ref=="A" & (current_alt=="C" | current_alt== "T")){current_ref=""; }
    else if(current_ref=="G" & (current_alt=="C" | current_alt== "T")){current_ref=""; }
    else if(current_ref=="C" & (current_alt=="A" | current_alt== "G")){current_ref=""; }
    else if(current_ref=="T" & (current_alt=="A" | current_alt== "G")){current_ref=""; }
    else{
      if(nchar(current_alt)>nchar(current_ref)){
        current_alt=paste("+", gsub(paste(current_ref,"$",sep=""),"",current_alt),sep="")
        current_ref=""
        }
      else if(nchar(current_alt)<nchar(current_ref)){
        d=nchar(current_ref)-nchar(current_alt)
        current_ref=""
        current_position=as.character(strtoi(current_position)+1) 
        if(d==1){current_alt="d"}else{current_alt=paste(c("-",as.character(strtoi(current_position)+d-1) ,"d"), sep="",collapse="")}
      }else{}
    }
    return( paste(c(current_ref,current_position,current_alt),sep="",collapse="") )
  }else{
    return("#")
  }
}

function2<-function(i,vcf){
  cat("Processing Sample No.", i, "\n")
  current=geno(vcf)$GT[,i]
  names(current)=NULL
  index=which(current!="0")
  current_variants=gsub("0/","",current[which(current!="0")])
  ref=fixed(vcf)$REF[index]
  position=data.frame(ranges(vcf))$start[index]
  alt=fixed(vcf)$ALT[index]
  variants=sapply(1:length(alt),function1,alt,current_variants,ref,position)
  variants=variants[which(variants!="#")]
  return(list("sample"=sample_id[i], "variants"=variants))
}

result=as.data.frame(do.call(rbind, lapply(1:dim(vcf)[2],function2,vcf)))
fileName=paste( c(getwd(),"/MitoToolVcf2Var_",format(Sys.time(),"%m_%d_%Y.%H.%M.%S"),".txt"),sep="",collapse="")
file.create(fileName)
fileConn<-file(fileName,"w")
for( i in 1:dim(result)[1]){
  writeLines(c(paste(">",result$sample[[i]],sep=""),paste(result$variants[[i]], sep=", ",collapse=","),"\n"), fileConn)
}
close(fileConn)


step 4: Use online/stand-alone MitoTool to analyze exported file (prefix: MitoToolVcf2Var_) and determine the haplogroup status of 1000G mtDNAs
# input getwd() at R terminal to find the destination folder of exported file   
# Tips: 573+XC, 965+XC, 960+XC, 5899+XC and 8287+XC are better to be manually checked and modified.


To slightly revise this R script enables it to deal with mtDNAs of next-generation sequencing data of domestic animals (i.e., mtDNAs of VCF files of domestic animals).