## ---- echo = FALSE,hide=TRUE, message=FALSE,warning=FALSE------------------
library(ELMER.data)
library(DT)
library(dplyr)

## ---- eval = FALSE---------------------------------------------------------
#  devtools::install_github(repo = "tiagochst/ELMER.data")
#  library("ELMER.data")
#  library("GenomicRanges")

## ---- eval=FALSE, include=TRUE---------------------------------------------
#  for(plat in c("450K","EPIC")) {
#    for(genome in c("hg38","hg19")) {
#      base <- "http://zwdzwd.io/InfiniumAnnotation/current/"
#      if(plat == "EPIC") {
#        annotation <- paste0(base,"EPIC/EPIC.manifest.rda")
#      } else {
#        annotation <- paste0(base,"hm450/hm450.manifest.rda")
#      }
#      if(genome == "hg38") annotation <- gsub(".rda",".hg38.rda", annotation)
#      if(!file.exists(basename(annotation))) {
#        if(Sys.info()["sysname"] == "Windows") mode <- "wb" else  mode <- "w"
#        downloader::download(annotation, basename(annotation), mode = mode)
#      }
#    }
#  }

## ---- message=FALSE--------------------------------------------------------
data("EPIC.manifest")
as.data.frame(EPIC.manifest)[1:5,] %>% datatable(options = list(scrollX = TRUE,pageLength = 5)) 
data("EPIC.manifest.hg38")
as.data.frame(EPIC.manifest.hg38)[1:5,] %>% datatable(options = list(scrollX = TRUE,pageLength = 5)) 
data("hm450.manifest.hg38")
as.data.frame(hm450.manifest.hg38)[1:5,] %>% datatable(options = list(scrollX = TRUE,pageLength = 5)) 
data("hm450.manifest")
as.data.frame(hm450.manifest)[1:5,] %>% datatable(options = list(scrollX = TRUE,pageLength = 5)) 

## ---- eval=FALSE, include=TRUE---------------------------------------------
#  library(xml2)
#  library(httr)
#  library(dplyr)
#  library(rvest)
#  createMotifRelevantTfs <- function(classification = "family"){
#    message("Accessing hocomoco to get last version of TFs ", classification)
#    file <- paste0(classification,".motif.relevant.TFs.rda")
#  
#      # Download from http://hocomoco.autosome.ru/human/mono
#      tf.family <- "http://hocomoco11.autosome.ru/human/mono?full=true" %>% read_html()  %>%  html_table()
#      tf.family <- tf.family[[1]]
#      # Split TF for each family, this will help us map for each motif which are the some ones in the family
#      # basicaly: for a TF get its family then get all TF in that family
#      col <- ifelse(classification == "family", "TF family","TF subfamily")
#      family <- split(tf.family,f = tf.family[[col]])
#  
#      motif.relevant.TFs <- plyr::alply(tf.family,1, function(x){
#        f <- x[[col]]
#        if(f == "") return(x$`Transcription factor`) # Case without family, we will get only the same object
#        return(unique(family[as.character(f)][[1]]$`Transcription factor`))
#      },.progress = "text")
#      #names(motif.relevant.TFs) <- tf.family$`Transcription factor`
#      names(motif.relevant.TFs) <- tf.family$Model
#      # Cleaning object
#      attr(motif.relevant.TFs,which="split_type") <- NULL
#      attr(motif.relevant.TFs,which="split_labels") <- NULL
#    return(motif.relevant.TFs)
#  }
#  TF.family <- createMotifRelevantTfs("family")
#  TF.subfamily <- createMotifRelevantTfs("subfamily")
#  save(TF.family,file = "TF.family.rda", compress = "xz")
#  save(TF.subfamily,file = "TF.subfamily.rda", compress = "xz")

## ---- eval=FALSE, include=TRUE---------------------------------------------
#  hocomoco.table <- "http://hocomoco11.autosome.ru/human/mono?full=true" %>% read_html()  %>%  html_table()
#  hocomoco.table <- hocomoco.table[[1]]
#  save(hocomoco.table,file = "data/hocomoco.table.rda", compress = "xz")

## --------------------------------------------------------------------------
data("Probes.motif.hg19.450K")
dim(Probes.motif.hg19.450K)
str(Probes.motif.hg19.450K)

## --------------------------------------------------------------------------
data("Probes.motif.hg38.450K")
dim(Probes.motif.hg38.450K)
str(Probes.motif.hg38.450K)

## --------------------------------------------------------------------------
data("Probes.motif.hg19.EPIC")
dim(Probes.motif.hg19.EPIC)
str(Probes.motif.hg19.EPIC)

## --------------------------------------------------------------------------
data("Probes.motif.hg38.EPIC")
dim(Probes.motif.hg38.EPIC)
str(Probes.motif.hg38.EPIC)

## ---- eval=FALSE, include=TRUE---------------------------------------------
#  getInfiniumAnnotation <- function(plat = "450K", genome = "hg38"){
#    message("Loading object: ",file)
#    newenv <- new.env()
#    if(plat == "EPIC" & genome == "hg19") data("EPIC.manifest", package = "ELMER.data",envir=newenv)
#    if(plat == "EPIC" & genome == "hg38") data("EPIC.manifest.hg38", package = "ELMER.data",envir=newenv)
#    if(plat == "450K" & genome == "hg19") data("hm450.manifest", package = "ELMER.data",envir=newenv)
#    if(plat == "450K" & genome == "hg38") data("hm450.manifest.hg38", package = "ELMER.data",envir=newenv)
#    annotation <- get(ls(newenv)[1],envir=newenv)
#    return(annotation)
#  }
#  # To find for each probe the know motif we will use HOMER software (http://homer.salk.edu/homer/)
#  # Step:
#  # 1 - get DNA methylation probes annotation with the regions
#  # 2 - Make a bed file from it
#  # 3 - Execute section: Finding Instance of Specific Motifs from http://homer.salk.edu/homer/ngs/peakMotifs.html to the HOCOMOCO TF motifs
#  # Also, As HOMER is using more RAM than the available we will split the files in to 100k probes.
#  # Obs: for each probe we create a winddow of 500 bp (-size 500) around it. This might lead to false positives, but will not have false negatives.
#  # The false posives will be removed latter with some statistical tests.
#  TFBS.motif <- "http://hocomoco11.autosome.ru/final_bundle/hocomoco11/full/HUMAN/mono/HOCOMOCOv11_full_HUMAN_mono_homer_format_0.0001.motif"
#  if(!file.exists(basename(TFBS.motif))) downloader::download(TFBS.motif,basename(TFBS.motif))
#  for(plat in c("450K","EPIC")){
#    for(gen in c("hg19","hg38")){
#  
#        file <- paste0(plat,gen,".txt")
#        print(file)
#        if(!file.exists(file)){
#          # STEP 1
#          gr <- getInfiniumAnnotation(plat = plat,genome =  gen)
#  
#          # This will remove masked probes. They have poor quality and might be arbitrarily positioned (Wanding Zhou)
#          gr <- gr[!gr$MASK.general]
#  
#          df <- data.frame(seqnames=seqnames(gr),
#                           starts=as.integer(start(gr)),
#                           ends=end(gr),
#                           names=names(gr),
#                           scores=c(rep(".", length(gr))),
#                           strands=strand(gr))
#          step <- 10000 # nb of lines in each file. 10K was selected to not explode RAM
#          n <- nrow(df)
#          for(j in 0:floor(n/step)){
#            # STEP 2
#            file.aux <- paste0(plat,gen,"_",j,".bed")
#            if(!file.exists(gsub(".bed",".txt",file.aux))){
#              end <- ifelse(((j + 1) * step) > n, n,((j + 1) * step))
#              write.table(df[((j * step) + 1):end,], file = file.aux, col.names = F, quote = F,row.names = F,sep = "\t")
#  
#              # STEP 3 use -mscore to get scores
#              cmd <- paste("source ~/.bash_rc; annotatePeaks.pl" ,file.aux, gen, "-m", basename(TFBS.motif), "-size 500 -cpu 12 >", gsub(".bed",".txt",file.aux))
#              system(cmd)
#            }
#          }
#        }
#        # We will merge the results from each file into one
#        peaks <- NULL
#        for(j in 0:floor(n/step)){
#          aux <-  readr::read_tsv(paste0(plat,gen,"_",j,".txt"))
#          colnames(aux)[1] <- "PeakID"
#          if(is.null(peaks)) {
#            peaks <- aux
#          } else {
#            peaks <- rbind(peaks, aux)
#          }
#        }
#        readr::write_tsv(peaks,path=file,col_names = TRUE)
#        print("DONE!")
#        gc()
#      }
#    }
#  }
#  
#  getMatrix <- function(filename) {
#    motifs <- readr::read_tsv(file)
#    # From 1 to 21 we have annotations then we have 640 motifs
#    matrix <- Matrix::Matrix(0, nrow = nrow(motifs), ncol = ncol(motifs) - 21 ,sparse = TRUE)
#    colnames(matrix) <- gsub(" Distance From Peak\\(sequence,strand,conservation\\)","",colnames(motifs)[-c(1:21)])
#    rownames(matrix) <- motifs$PeakID
#    matrix[!is.na(motifs[,-c(1:21)])] <- 1
#    matrix <- as(matrix, "nsparseMatrix")
#    return(matrix)
#  }
#  
#  # This code will read the table with the motifs, save it as a sparce matrix
#  # and save all as a .rda that will be placed in ELMER.data
#  getMatrix <- function(filename) {
#    motifs <- readr::read_tsv(file)
#    # From 1 to 21 we have annotations then we have 640 motifs
#    matrix <- Matrix::Matrix(0, nrow = nrow(motifs), ncol = 640,sparse = TRUE)
#    colnames(matrix) <- gsub(" Distance From Peak\\(sequence,strand,conservation\\)","",colnames(motifs)[-c(1:21)])
#    rownames(matrix) <- motifs$PeakID
#    matrix[!is.na(motifs[,-c(1:21)])] <- 1
#    matrix <- as(matrix, "nsparseMatrix")
#    return(matrix)
#  }
#  for(plat in c("EPIC", "450K")){
#    for(gen in c("hg19","hg38")){
#      file <- paste0(plat,gen,".txt")
#      if(file == "450Khg19.txt"){
#        Probes.motif.hg19.450K <- getMatrix(file)
#        save(Probes.motif.hg19.450K, file = "Probes.motif.hg19.450K.rda", compress = "xz")
#        rm(Probes.motif.hg19.450K)
#      }
#      if(file == "450Khg38.txt"){
#        Probes.motif.hg38.450K <- getMatrix(file)
#        save(Probes.motif.hg38.450K, file = "Probes.motif.hg38.450K.rda", compress = "xz")
#        rm(Probes.motif.hg38.450K)
#      }
#  
#      if(file == "EPIChg19.txt"){
#        Probes.motif.hg19.EPIC <- getMatrix(file)
#        save(Probes.motif.hg19.EPIC, file = "Probes.motif.hg19.EPIC.rda", compress = "xz")
#        rm(Probes.motif.hg19.EPIC)
#      }
#  
#      if(file == "EPIChg38.txt"){
#        Probes.motif.hg38.EPIC <- getMatrix(file)
#        save(Probes.motif.hg38.EPIC, file = "Probes.motif.hg38.EPIC.rda", compress = "xz")
#        rm(Probes.motif.hg38.EPIC)
#      }
#    }
#  }

## --------------------------------------------------------------------------
data("Probes.motif.hg19.450K")
as.data.frame(as.matrix(Probes.motif.hg19.450K[1:20,1:20])) %>% 
  datatable(options = list(scrollX = TRUE,pageLength = 5)) 

## ----sessionInfo-----------------------------------------------------------
sessionInfo()

