## ----style, echo = FALSE-------------------------------------------------
BiocStyle::markdown()

## ----, env, message=FALSE, echo=FALSE, cache=FALSE, warning=FALSE--------
library("Gviz")
library("Pbase")
library("Biostrings")
library("biomaRt")
library("BSgenome.Hsapiens.UCSC.hg19")
library("ggplot2")

## ----schema, echo=FALSE, fig.cap='Mapping proteins to a genome reference.', fig.align='center'----
par(mar = rep(0, 4),
    oma = rep(0, 4))

xlim <- c(0, 10)
ylim <-c(5.05, 10.25)

plot(0, type = "n", axes = FALSE,
     xlab = "", ylab = "",
     xlim = xlim,    
     ylim = ylim)

y <- 10

g <- c(0, 10)

exons <- list(c(0, 2),
              c(4.5, 5),
              c(6, 7.1),
              c(9, 10))

trans <- list(c(1, 2, 3, 4),
              c(1, 3, 4),
              c(1, 4))

prot <- list(exons[[1]] + c(0.5, 0), ## 5'UTR
             exons[[3]],
             exons[[4]] + c(0, -0.5)) ## 3'UTR

peps <- list(c(1, 1.29),
             c(6.1, 6.3), c(6.85, 7),
             c(9.1, 9.25))


rect(g[1], y-0.25, g[2], y+0.25, col = "black")
abline(h = y, lwd = 2)
text(g[1], y+0.25, expression(g[s]), pos = 3)
text(g[2], y+0.25, expression(g[e]), pos = 3)
text(0-0.26, y, "G", pos = 3)

abline(v = c(unlist(exons),
           unlist(prot),
           unlist(peps)),
       lty = "dotted",
       col = "lightgrey")

y <- y - 1
for (i in 1:length(trans)) {
    tr <- trans[[i]]
    abline(h = y, lty = "dotted")
    ## text(0-0.26, y, expression(T[x]), pos = 3)
    text(0-0.26, y,
         substitute(paste("T", list(x)), list(x = i)),
         pos = 3)
    for (j in 1:length(tr)) {
        e1 <- exons[[tr[j]]][1]
        e2 <- exons[[tr[j]]][2]
        rect(e1, y-0.25, e2, y+0.25, col = "grey")
        if (i == 1) {
            text(e1, y+0.25, expression(e[s]^i), pos = 3)
            text(e2, y+0.25, expression(e[e]^i), pos = 3)
        }
        text(mean(c(e1, e2)), y, paste0("i = ", tr[j]), cex = .7)
    }
    y <- y - .6
}

y <- y - .4

abline(h = y, lty = "dotted")
text(0-0.26, y, expression(P), pos = 3)
for (i in 1:length(prot)) {
    pr <- prot[[i]]
    rect(pr[1], y - 0.25,
         pr[2], y + 0.25,
         col = "steelblue")
    text(pr[1], y + 0.25, expression(p[s]^j), pos = 3)
    text(pr[2], y + 0.25, expression(p[e]^j), pos = 3)
    text(mean(c(pr[1], pr[2])), y, paste0("j = ", i), cex = .7)
}
y <- y - .75

## concat protein
## center
x <- mean(xlim)
protlen <- sum(sapply(prot, diff))
X0 <- x0 <- x - protlen/2 ## left start

abline(h = y, lty = "dotted")
text(0-0.26, y, expression(P), pos = 3)

for (i in 1:length(prot)) {
    pr <- prot[[i]]
    .pr1 <- x0
    .pr2 <- x0 + (pr[2] - pr[1])
    rect(.pr1, y - 0.25,
         .pr2, y + 0.25,
         col = "steelblue")
    segments(.pr1, y + 0.25, pr[1], y + .75 - .25, lty = "dotted")
    segments(.pr2, y + 0.25, pr[2], y + .75 - .25, lty = "dotted")
    text(mean(c(.pr1,.pr2)), y, paste0("j = ", i), cex = .7)
    x0 <- .pr2
}

text(X0, y, expression(1), pos = 2)
text(X0 + protlen, y, expression(L[P]), pos = 4)

## pos and length
relpeps <- list(c(0.5, 0.29),
                ## 1.% is length of exon 1
                c(1.5 + 0.1, 0.2),
                c(1.5 + 0.85, 0.15),
                ## 1.1 is length of exon 2
                c(1.5 + 1.1 + 0.1, 0.15))

for (i in 1:length(relpeps)) {
    rp <- relpeps[[i]]
    pep <- peps[[i]]
    rect(X0 + rp[1], y - 0.25,
         X0 + rp[1] + rp[2], y + 0.25,
         col = "#FFA50450", lwd = 0)
    segments(X0 + rp[1], y - 0.25,
             pep[1], y - .75 + .25, lty = "dotted")
    segments(X0 + rp[1] + rp[2], y - 0.25,
             pep[2], y - .75 + .25, lty = "dotted")
}
y <- y - .75


abline(h = y, lty = "dotted")
text(0-0.26, y, expression(Pi), pos = 3)
for (i in 1:length(peps)) {
    pep <- peps[[i]]
    rect(pep[1], y - 0.25,
         pep[2], y + 0.25,
         col = "#FFA504FF")
    text(pep[1], y + 0.25, expression(pi[s]^k), pos = 3, cex = .7)
    text(pep[2], y + 0.25, expression(pi[e]^k), pos = 3, cex = .7)
    text(mean(c(pep[1], pep[2])), y-0.35, paste0("k = ", i), cex = .8)
}

## ----, echo=FALSE--------------------------------------------------------
data(p)
sp <- seqnames(p)[5]

## ----, p, message=FALSE--------------------------------------------------
library("Pbase")
data(p)
p
seqnames(p)
dat <- data.frame(i = 1:length(p),
                  npeps = elementLengths(pfeatures(p)),
                  protln = width(aa(p)))
dat
sp <- seqnames(p)[5]

## ----bm, cache=TRUE, message=FALSE---------------------------------------
library("biomaRt")
ens <- useMart("ensembl", "hsapiens_gene_ensembl")

bm <- select(ens, keys = sp,
             keytype = "uniprot_swissprot_accession",
             columns = c(
                 "ensembl_gene_id",
                 "ensembl_transcript_id",
                 "ensembl_peptide_id",
                 "ensembl_exon_id",  
                 "description",     
                 "chromosome_name",
                 "strand",
                 "exon_chrom_start",
                 "exon_chrom_end",
                 "5_utr_start",
                 "5_utr_end",
                 "3_utr_start",
                 "3_utr_end",
                 "start_position",     
                 "end_position"))

bm


## ------------------------------------------------------------------------
bm <- bm[bm$chromosome_name == "X", ]
tr <- unique(bm$ensembl_transcript_id)
pr <- unique(bm$ensembl_peptide_id)

## ----gviz, cache=TRUE, message=FALSE-------------------------------------
library("Gviz")
options(ucscChromosomeNames=FALSE)

bmTrack <- BiomartGeneRegionTrack(start = min(bm$start_position),
                                  end = max(bm$end_position),
                                  chromosome = "chrX",
                                  genome = "hg19")

## see also the filters param and getBM to
## filter on biotype == "protein_coding"

bmTracks <- split(bmTrack, transcript(bmTrack))

grTrack <- bmTracks[[which(names(bmTracks) == tr)]]
names(grTrack) <- tr

## ----gvizdfr-------------------------------------------------------------
as(grTrack, "data.frame")

## ----gvizfig, fig.align='center', cache=TRUE-----------------------------
ideoTrack <- IdeogramTrack(genome = "hg19",
                           chromosome = "chrX")
axisTrack <- GenomeAxisTrack()
seqlevels(ranges(grTrack)) <- 
    chromosome(grTrack) <-
        chromosome(ideoTrack)
plotTracks(list(ideoTrack, axisTrack, grTrack),
           add53 = TRUE, add35 = TRUE)

## ----, txdb, cache=TRUE, message=FALSE-----------------------------------
library("TxDb.Hsapiens.UCSC.hg19.knownGene")
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
txTr <- GeneRegionTrack(txdb, chromosome = "chrX",
                        start = bm$start_position[1],
                        end = bm$end_position[1])

head(as(txTr, "data.frame"))

## ----, pp0, echo=FALSE---------------------------------------------------
pp <- p[sp]
n <- elementLengths(pranges(pp))
pepstring <- paste(unique(pcols(pp)[[1]]$pepseq), collapse = ", ")

## ----, pepsfig, fig.align='center'---------------------------------------
pp <- p[sp]
pranges(pp)
plot(pp)

## ----, protranges--------------------------------------------------------
(prng <- ranges(grTrack[feature(grTrack) == "protein_coding",]))

## ----, extractat, eval=FALSE---------------------------------------------
#  library("Biostrings")
#  chrx <- readDNAStringSet("./Homo_sapiens.GRCh37.75.dna.chromosome.X.fa.gz")
#  seq <- extractAt(chrx[[1]], ranges(prng))
#  pptr <- translate(unlist(seq))

## ----, protfromgenome----------------------------------------------------
ucsc <- select(ens, key = pr,
               keytype = "ensembl_peptide_id",
               columns = "ucsc")
ucsc

library("BSgenome.Hsapiens.UCSC.hg19")
prng2 <- ranges(txTr[transcript(txTr) %in% ucsc])
prng2 <- prng2[prng2$feature == "CDS"]

seq <- getSeq(BSgenome.Hsapiens.UCSC.hg19, prng2)
pptr <- translate(unlist(seq))
## remove stop codon
pptr <- pptr[1:417]

## ----, align-------------------------------------------------------------
writePairwiseAlignments(pairwiseAlignment(pp[[1]], pptr))

## ----, mapping-----------------------------------------------------------
## exons ranges along the protein
j <- cumsum(width(seq))
i <- cumsum(c(1, width(seq)))[1:length(j)]
prex <- IRanges(start = i, end = j)

peprng <- pranges(pp)[[1]]
## peptide positions on cdna
peprng2 <- IRanges(start = 1 + (start(peprng)-1) * 3,
                   width = width(peprng) * 3)

## find exon and position in prex 
start_ex <- subjectHits(findOverlaps(start(peprng2), prex))
end_ex <- subjectHits(findOverlaps(end(peprng2), prex))

getPos <- function(p, i, prtex = prex, nclex = prng2) {
    ## position in cdna
    ## exon index
    start(prng2[i]) + (p - start(prtex[i]))
}

peptides_on_genome <- IRanges(start = getPos(start(peprng2), start_ex),
                              end = getPos(end(peprng2), end_ex))


## ----, pepcoords, fig.align='center'-------------------------------------
pepTr <- AnnotationTrack(start = start(peptides_on_genome),
                         end = end(peptides_on_genome),
                         chr = "chrX", genome = "hg19",
                         strand = "*",
                         id = pcols(pp)[[1]]$pepseq,
                         name = "pfeatures",
                         col = "steelblue")


plotTracks(list(ideoTrack, axisTrack, grTrack, pepTr),
           groupAnnotation = "id",
           just.group = "below",
           fontsize.group = 9,
           add53 = TRUE, add35 = TRUE)


## ----, msmsspectra, fig.align='center'-----------------------------------
data(pms)

library("ggplot2")
details <- function(identifier, ...) {
    p <- plot(pms[[as.numeric(identifier)]], full=TRUE, plot=FALSE) + ggtitle("") 
    p <- p + theme_bw() + theme(axis.text.y = element_blank(),
                                axis.text.x = element_blank()) + 
                                labs(x = NULL, y = NULL)
                                        
    print(p, newpage=FALSE)
}

deTrack <- AnnotationTrack(start = start(peptides_on_genome),
                           end = end(peptides_on_genome),
                           genome = "hg19", chromosom = "chrX",
                           id = pcols(pp)[[1]]$acquisitionnum,
                           name = "MS2 spectra",
                           stacking = "squish", fun = details)

plotTracks(list(ideoTrack, axisTrack, deTrack, grTrack),
           add53 = TRUE, add35 = TRUE)

## ----, si----------------------------------------------------------------
sessionInfo()

