# Variant table: tab-separated, first line holds the column names.
data <- read.delim("toDF_27feb.tsv", header = TRUE)

# The five class labels matched against the InSiGHT_Class column.
classes <- paste("Class", 1:5)

# InSiGHT labels of the rows that carry one of the five classes.
selected <- data$InSiGHT_Class[data$InSiGHT_Class %in% classes]
# Empirical prior (class frequencies), currently disabled:
#priors <- table(selected) / length(selected)

# Uniform (uninformative) prior: every class gets the same weight.
priors <- setNames(rep(1 / length(classes), length(classes)), classes)

# Build the class-conditional likelihood of a CADD score.
#
# class: value of the InSiGHT_Class column selecting the rows to use.
#
# Returns a function of a score that linearly interpolates a kernel density
# estimate of the CADD_phred values of that class. The density is evaluated on
# [0, 50], so the returned function yields NA outside that interval.
likelihood <- function(class) {
    in.class <- data[["InSiGHT_Class"]] == class
    scores <- data[in.class, "CADD_phred"]
    kde <- density(scores, from = 0, to = 50, na.rm = TRUE)
    approxfun(kde$x, kde$y)
}

library('logspline')
# Alternative class-conditional likelihood based on a logspline density fit.
#
# class: value of the InSiGHT_Class column selecting the rows to use.
#
# Returns a function of a score that evaluates the fitted logspline density.
# Missing CADD_phred values are removed before fitting.
logspline.likelihood <- function(class) {
    in.class <- data$InSiGHT_Class == class
    scores <- na.omit(data[in.class, "CADD_phred"])
    lsp <- logspline(scores)
    function(score) dlogspline(score, fit = lsp)
}

# One kernel-density likelihood function per class, addressable by class name.
likelihoods <- setNames(lapply(classes, likelihood), classes)

# Normalizing constant (evidence) of the posterior for a CADD score:
# the sum over all classes of likelihood(score | class) * prior(class).
#
# score: numeric CADD score; may be a vector, in which case the result has one
#        entry per score (the sum is taken over classes only, never across
#        scores).
#
# Returns a numeric vector of the same length as `score`; NA wherever any
# class likelihood is NA for that score.
normalize <- function(score) {
    evidence <- numeric(length(score))
    for (class in classes) {
        evidence <- evidence + likelihoods[[class]](score) * priors[[class]]
    }
    evidence
}

# Posterior weight of `class` given a CADD score (Bayes' rule):
# likelihood(score | class) * prior(class), divided by normalize(score).
#
# score: numeric CADD score.
# class: name of the class, used to index `likelihoods` and `priors`.
posterior <- function(score, class) {
    joint <- likelihoods[[class]](score) * priors[[class]]
    joint / normalize(score)
}

# Draw the posterior of every class as a function of the CADD score.
#
# classes: class names; the first one sets up the plot, the remaining ones are
#          added as extra lines. Each line is coloured by the position of its
#          class in `classes`, matching the legend.
#
# Scores are evaluated at the integers 0..50 on the current graphics device.
tradeoffPlot <- function(classes) {
    scores <- 0:50

    # Posterior of one class evaluated at every score of the grid.
    curveFor <- function(cls) {
        sapply(scores, function(score) posterior(score, cls))
    }

    plot(scores, curveFor(classes[[1]]),
         type = "l", ylim = c(0, 1), col = 1,
         xlab = "CADD Score", ylab = "P(x|x~C)")

    for (cls in classes[-1]) {
        lines(scores, curveFor(cls), col = which(classes == cls))
    }

    legend("topright", classes, col = seq_along(classes), bty = 'n', cex = .75, lty = 1)
}

# Render the posterior trade-off plot into a PDF.
# The device is closed in `finally`, so a failure inside tradeoffPlot() does
# not leave a dangling open graphics device behind.
grDevices::pdf(file="~/Desktop/tradeoff.pdf")
invisible(tryCatch(
    tradeoffPlot(classes),
    finally = grDevices::dev.off()
))


# Plot the kernel density estimate of the CADD scores of one class, with a red
# rug marking the individual observations.
#
# class: value of the InSiGHT_Class column selecting the rows; also used as
#        the plot title.
densityPlot <- function(class) {
    in.class <- data[["InSiGHT_Class"]] == class
    scores <- data[in.class, "CADD_phred"]
    kde <- density(scores, from = 0, to = 50, na.rm = TRUE)
    # Disabled logspline overlay, kept for reference:
    # logSpline <- logspline(na.omit(scores))
    plot(kde, lty = 1, ylim = c(0, 0.1), main = class)
    # lines(dlogspline(1:50, logSpline), lty=2)
    rug(scores, col = "red")
    # legend("topright", c("Kernel Density", "Logspline"), bty='n', cex=.75, lty=c(1,2))
}

# Render one density page per class into a PDF.
# The device is closed in `finally`, so a failure inside densityPlot() does
# not leave a dangling open graphics device behind.
grDevices::pdf(file="~/Desktop/density.pdf")
invisible(tryCatch(
    for(class in classes) densityPlot(class),
    finally = grDevices::dev.off()
))

# For every row of the data, compute the posterior of each class given that
# row's CADD score (the script's p(x|x~D)). The inner sapply yields one value
# per class, so after transposing there is one row per variant and one column
# per class.
probs <- t(sapply(data$CADD_phred, function(score) {
    sapply(classes, function(class) posterior(score, class))
}))
colnames(probs) <- paste0("p~", classes)

# Original columns followed by the "p~<class>" posterior columns.
data.with.probs <- cbind(data, probs)

# Select the variants of one class whose posterior for that same class is
# unusually low.
#
# data:  data frame with an "InSiGHT_Class" column and a posterior column
#        named "p~<class>".
# class: class name, e.g. "Class 1".
# probs: quantile probabilities passed to quantile(); only the first (lowest)
#        one is used as the cut-off.
#
# Returns the rows of `data` belonging to `class` whose "p~<class>" value lies
# strictly below the first requested quantile. Rows with a missing class or a
# missing posterior are left out: which() discards NA comparisons, which `[`
# would otherwise turn into rows consisting entirely of NA.
least.likely <- function(data, class, probs=c(0.05, .95)) {
    classData <- data[which(data[["InSiGHT_Class"]] == class), ]
    CADD.posterior <- paste0("p~", class)
    outliers <- quantile(classData[, CADD.posterior], probs = probs, na.rm = TRUE)
    classData[which(classData[[CADD.posterior]] < outliers[1]), ]
}

# Export one "least likely" table per class. The file name is built from the
# class name with punctuation characters removed.
for (class in classes) {
    df <- least.likely(data.with.probs, class)
    fileName <- paste0("least_likely-", gsub("[[:punct:]]", "", class), ".tsv")
    write.table(
        x = df[rowSums(!is.na(df)) > 0, ], # keep rows with at least one non-NA value
        file = fileName,
        sep = "\t",
        row.names = FALSE
    )
}

# get the "misclassification": for every variant pick the class with the
# highest CADD-based posterior and keep the variants where that class differs
# from the InSiGHT class.
x <- data.with.probs[, grep('p~', names(data.with.probs)), drop = FALSE]
CADDClass <- apply(x, 1, function(y) {
    m <- which.max(y)
    # which.max() returns integer(0) when every posterior of the row is NA;
    # such rows get the placeholder "?" and are removed below.
    if (length(m) == 0) "?" else gsub("p~", "", names(x)[[m]])
})
data.tmp <- cbind(data.with.probs, "CADD" = CADDClass)
# which() discards comparisons that evaluate to NA (missing InSiGHT class);
# indexing with the raw logical vector would turn each NA into a row made
# entirely of NA values.
miss <- data.tmp[which(as.character(data.tmp$InSiGHT_Class) != as.character(data.tmp$CADD)), ]
miss <- miss[which(miss$CADD != "?"), ]

# Export every potential misclassification.
write.table(
    x = miss[rowSums(!is.na(miss)) > 0, ], # keep rows with at least one non-NA value
    file = "potential_misclassification.tsv",
    sep = "\t",
    row.names = FALSE
)

# Only the opposite-end disagreements: InSiGHT says "Class 5" while CADD says
# "Class 1", or the other way around.
extreme <- miss[with(miss,
                     (InSiGHT_Class == "Class 5" & CADD == "Class 1") |
                     (InSiGHT_Class == "Class 1" & CADD == "Class 5")), ]

write.table(
    x = extreme[rowSums(!is.na(extreme)) > 0, ], # keep rows with at least one non-NA value
    file = "potential_extreme_misclassification.tsv",
    sep = "\t",
    row.names = FALSE
)


# Keep only the rows labelled with one of the five classes and re-encode the
# label as a factor built from the values that remain.
data.tmp <- data[is.element(data$InSiGHT_Class, classes), ]
data.tmp$InSiGHT_Class <- factor(data.tmp$InSiGHT_Class)

# Spearman rank correlation between the integer codes of the class factor and
# the rounded CADD score.
cor.test(as.numeric(data.tmp$InSiGHT_Class), round(data.tmp$CADD_phred), method = "spearman")

library("stringr")
# Extract the effect name from each Effect_raw entry: the pattern captures the
# run of digits, capital letters and underscores directly before a
# parenthesised part; element 2 of the match is that first capture group.
effects <- as.factor(sapply(data.tmp$Effect_raw, function(eff) {
    str_match(eff, "([0-9A-Z_]*)\\((.*)\\)")[[2]]
}))
data.tmp <- cbind(data.tmp, Effect = effects)


library("rjags")
