# This script contains functions needed to run EM algorithm as published in Nair et al. 2014 as well as to handle # the results and create figures. # EM algorithm implementations and auxilary function em_shape_flip = function(c,q,data) { # EM algorithm to optimize classes profiles using data flipping. There are only # two flip states (sense, reversed) which do not need to be precised. K=dim(c)[1] N=dim(data)[1] l=array(dim=c(N,K,2)) p=array(dim=c(N,K,2)) for(i in 1:K) { c[i,]=c[i,]/mean(c[i,]) } rm=rowMeans(data) for(i in 1:N) { for(j in 1:K) {l[i,j,1] = sum(dpois(data[i,], c[j,] *rm[i],log=T)) } for(j in 1:K) {l[i,j,2] = sum(dpois(data[i,],rev(c[j,])*rm[i],log=T)) } } for(i in 1:N) { p[i,,] = q*exp(l[i,,] - max(l[i,,])) p[i,,] = p[i,,]/sum(p[i,,]) } q = apply(p, c(2,3), mean) c = (t(p[,,1]) %*% data) + (t(p[,,2]) %*% t(apply(data,1,rev))) c = c / apply(p,2,sum) return(list(c, q, p)) } em_shape_shift_flip = function(c,q,data) { # EM algorithm to optimize classes profiles using data flipping and shifting. The shifting paramter # is the second dimension of q. # # c a matrix (K rows x j columns) containing the K classes profiles to optimize # q the prior class probabilities # data a matrix of data where a row is a sample signal contained within j columns (bins) K = dim(c)[1] # number of classes L = dim(c)[2] # number of bins N = dim(data)[1] # number of data rows S = dim(q)[2] # shift in bins l = array(dim=c(N,K,S,2)) p = array(dim=c(N,K,S,2)) for(i in 1:K) { c[i,] = c[i,]/mean(c[i,]) } rm = matrix(nrow=N, ncol=S) for(k in 1:S) { rm[,k] = rowMeans(data[,k:(k+L-1)]) } for(i in 1:N) # classes { for(j in 1:K) # data rows { for(k in 1:S) # shift { l[i,j,k,1] = sum(dpois( data[i, k:(k+L-1)], c[j,]*rm[i, k], log=T)) # one orientation l[i,j,k,2] = sum(dpois(rev(data[i,])[k:(k+L-1)], c[j,]*rm[i, S-k+1], log=T)) # the other orientation } } } for(i in 1:N) { p[i,,,] = q*exp(l[i,,,]-max(l[i,,,])) p[i,,,] = p[i,,,]/sum(p[i,,,]) } q = apply(p, c(2,3,4), mean) c = 0 for(k in 1:S) { c = c + (t(p[,,k,1]) %*% data[,k :(k+L-1)]) + (t(p[,,k,2]) %*% t(apply(data[,(S-k+1):(S+L-k)], 1, rev))) } c = c / apply(p, 2, sum) m = sum((1:S)* apply(q,2,sum)) s = sum(((1:S) - m)**2*apply(q,2,sum))**0.5 for(i in 1:K) { q[i,,1] = q[i,,2] = sum(q[i,,]) * dnorm(1:S,floor(S/2)+1,s) / sum(dnorm(1:S,floor(S/2)+1,s)) / 2 } return(list(c,q,p)) } # recenter shift probability distribution to force it to be a gaussian reg_shift_flip= function(q) { # This function regularizes the prior distribution of the shift probabilities to # make it a centered gaussian distribution. # This is a modified version of reg_shift() designed to handle 3D array coming # from em_shape_flip_shift() K=dim(q)[1] S=dim(q)[2] # There is no biological reason to estimate the mean and the sd from each flip state # independently. Thus the mean and sd will be computed from the whole array at once. m= sum((1:S)* apply(q,2,sum)) s=sum(((1:S)-m)**2*apply(q,2,sum))**0.5 for (i in 1:K) { q[i,,1] = q[i,,2] = sum(q[i,,]) * dnorm(1:S,floor(S/2)+1,s) / sum(dnorm(1:S,floor(S/2)+1,s)) / 2 } return(q) } # load/dump prob array to hard drive em.write.prob <- function(P, file) { # Dumps a class probability array/matrix

as return by the EM algorithm on the hard drive, under compressed RDS # format. This function is the reciproque function of em.read.prob(). # # Returns A string containing the address of the written file # # P an array or a matrix containing the EM posteriori class probability for each datum in the original data. # file a string indicating in which file P will be dumped. saveRDS(P, file=file) return(file) } em.read.prob <- function(file) { # Reads a compressed RDS formatted dumped class probability array/matrix as return by the EM algorithm and returns it. # This function is the reciproque function of em.write.prob(). # # Returns a matrix or an array (depending whether shift was enabled during the EM run) # # file a string indicating the address of the file to read. if(!file.exists(file)) { stop(sprintf("Error! %s does not exist!", file)) } array <- readRDS(file) return(array) } # extract data from SGA files em.extract.unoriented.sga.class.max <- function(SGA, P, file, bin.size, shift=F, flip=F, class, threshold=NA) { # Extracts and updates positions from an SGA file data stored in a dataframe, given the results of EM classification