# This script contains functions needed to run EM algorithm as published in Nair et al. 2014 as well as to handle
# the results and create figures.
# EM algorithm implementations and auxilary function
em_shape_flip = function(c,q,data)
{ # EM algorithm to optimize classes profiles using data flipping. There are only
# two flip states (sense, reversed) which do not need to be precised.
K=dim(c)[1]
N=dim(data)[1]
l=array(dim=c(N,K,2))
p=array(dim=c(N,K,2))
for(i in 1:K) { c[i,]=c[i,]/mean(c[i,]) }
rm=rowMeans(data)
for(i in 1:N)
{ for(j in 1:K) {l[i,j,1] = sum(dpois(data[i,], c[j,] *rm[i],log=T)) }
for(j in 1:K) {l[i,j,2] = sum(dpois(data[i,],rev(c[j,])*rm[i],log=T)) }
}
for(i in 1:N)
{ p[i,,] = q*exp(l[i,,] - max(l[i,,]))
p[i,,] = p[i,,]/sum(p[i,,])
}
q = apply(p, c(2,3), mean)
c = (t(p[,,1]) %*% data) + (t(p[,,2]) %*% t(apply(data,1,rev)))
c = c / apply(p,2,sum)
return(list(c, q, p))
}
em_shape_shift_flip = function(c,q,data)
{ # EM algorithm to optimize classes profiles using data flipping and shifting. The shifting paramter
# is the second dimension of q.
#
# c a matrix (K rows x j columns) containing the K classes profiles to optimize
# q the prior class probabilities
# data a matrix of data where a row is a sample signal contained within j columns (bins)
K = dim(c)[1] # number of classes
L = dim(c)[2] # number of bins
N = dim(data)[1] # number of data rows
S = dim(q)[2] # shift in bins
l = array(dim=c(N,K,S,2))
p = array(dim=c(N,K,S,2))
for(i in 1:K)
{ c[i,] = c[i,]/mean(c[i,]) }
rm = matrix(nrow=N, ncol=S)
for(k in 1:S)
{ rm[,k] = rowMeans(data[,k:(k+L-1)]) }
for(i in 1:N) # classes
{ for(j in 1:K) # data rows
{ for(k in 1:S) # shift
{ l[i,j,k,1] = sum(dpois( data[i, k:(k+L-1)], c[j,]*rm[i, k], log=T)) # one orientation
l[i,j,k,2] = sum(dpois(rev(data[i,])[k:(k+L-1)], c[j,]*rm[i, S-k+1], log=T)) # the other orientation
}
}
}
for(i in 1:N)
{ p[i,,,] = q*exp(l[i,,,]-max(l[i,,,]))
p[i,,,] = p[i,,,]/sum(p[i,,,])
}
q = apply(p, c(2,3,4), mean)
c = 0
for(k in 1:S)
{ c = c + (t(p[,,k,1]) %*% data[,k :(k+L-1)])
+ (t(p[,,k,2]) %*% t(apply(data[,(S-k+1):(S+L-k)], 1, rev)))
}
c = c / apply(p, 2, sum)
m = sum((1:S)* apply(q,2,sum))
s = sum(((1:S) - m)**2*apply(q,2,sum))**0.5
for(i in 1:K)
{ q[i,,1] = q[i,,2] = sum(q[i,,]) * dnorm(1:S,floor(S/2)+1,s) / sum(dnorm(1:S,floor(S/2)+1,s)) / 2 }
return(list(c,q,p))
}
# recenter shift probability distribution to force it to be a gaussian
reg_shift_flip= function(q)
{ # This function regularizes the prior distribution of the shift probabilities to
# make it a centered gaussian distribution.
# This is a modified version of reg_shift() designed to handle 3D array coming
# from em_shape_flip_shift()
K=dim(q)[1]
S=dim(q)[2]
# There is no biological reason to estimate the mean and the sd from each flip state
# independently. Thus the mean and sd will be computed from the whole array at once.
m= sum((1:S)* apply(q,2,sum))
s=sum(((1:S)-m)**2*apply(q,2,sum))**0.5
for (i in 1:K)
{ q[i,,1] = q[i,,2] = sum(q[i,,]) * dnorm(1:S,floor(S/2)+1,s) / sum(dnorm(1:S,floor(S/2)+1,s)) / 2 }
return(q)
}
# load/dump prob array to hard drive
em.write.prob <- function(P, file)
{ # Dumps a class probability array/matrix as return by the EM algorithm on the hard drive, under compressed RDS
# format. This function is the reciproque function of em.read.prob().
#
# Returns A string containing the address of the written file
#
# P an array or a matrix containing the EM posteriori class probability for each datum in the original data.
# file a string indicating in which file P will be dumped.
saveRDS(P, file=file)
return(file)
}
em.read.prob <- function(file)
{ # Reads a compressed RDS formatted dumped class probability array/matrix as return by the EM algorithm and returns it.
# This function is the reciproque function of em.write.prob().
#
# Returns a matrix or an array (depending whether shift was enabled during the EM run)
#
# file a string indicating the address of the file to read.
if(!file.exists(file))
{ stop(sprintf("Error! %s does not exist!", file)) }
array <- readRDS(file)
return(array)
}
# extract data from SGA files
em.extract.unoriented.sga.class.max <- function(SGA, P, file, bin.size, shift=F, flip=F, class, threshold=NA)
{ # Extracts and updates positions from an SGA file data stored in a dataframe, given the results of EM classification

if the positions
# were assigned to class with a probability higher than for any other class (and if specified if this probabilities
# are bigger than , where threshold should be > 0.5). The extracted positions are then updated if needed and
# written to a file .
#
# Extracted SGA coordinates are updates in 3 ways :
# 1) EM was run allowing flip. For a given datum, for the class of interet, if the major flip state is forward, strand is
# set to '+', otherwise '-'.
# 2) EM was run allowing shift. For a given datum, for the class of interest, the positions are shifted by a value
# corresponding to the weighted average (knowing ) of all the shift states for this class.
# 3) EM was run allowing shift and flip. For a given datum, for the class of interest, if the major flip state is forward,
# the shift uodate is the shift weighted average and the strand is set to '+'. Otherwise, the shift update is the
# weighted average sign is reversed (+20bp -> -20bp) and the strand is set to '-'.
#
# SGA a dataframe containing the data of the sga file of interest.
# P a numerical array, the results of the EM classification.
# file a string, the path where the extracted data will be written.
# bin.size an int, the em algorithm matrix input bin size
# shift a boolean indicating whether shift was allowed.
# flip a boolean indicating whether flip was allowed.
# class an int, the class to extract features from.
# threshold a float, optonal, the minimal class probability for a feature
# to be extracted.
d = length(dim(P))
n.datum = dim(P)[1]
n.class = dim(P)[2]
# check whether PROB and filtered SGA correspond, they should have the same number of rows
if(n.datum != nrow(SGA))
{ stop(sprintf("Error! Prob array and sga dataframe do not have the same number of rows (%d vs %d)!", n.datum, nrow(SGA))) }
# check whether file directory exists
if(!dir.exists(dirname(file)))
{ stop(sprintf("Error! Directory %s does not exist!", dirname(file))) }
# check bin size is > 0
if(bin.size <= 0)
{ stop(sprintf("Error! Bin size should be > 0!")) }
# check class is an int > 0
if(class <= 0 || class > n.class)
{ stop(sprintf("Error! Class number is incorrect (%d classes detected)", n.class)) }
# check threshold if given is > 0.5 assign a row to one class only
if(!is.na(threshold) && threshold <= 0.5)
{ stop(sprintf("Error! Threshold should be strictly bigger than 0.5 to unambigously assign one major class per datum!")) }
# no shift nor flip : no coord update needed
if((!shift && !flip))
{ # check dimension
if(d != 2)
{ stop(sprintf("Error! Without shift nor flip, probability array is expected to have 2 dimensions (%d found)!", d)) }
if((!shift && flip) && (d != 3))
{ stop(sprintf("Error! With flip, probability array is expected to have 3 dimensions (%d found)!", d)) }
# order class prob by decreasing order and retrieve class with highest prob
CLASS.ORDER = t(apply(P, 1, order, decreasing=T))
CLASS.MAX = CLASS.ORDER[,1]
SELECTED.ROWS = which(CLASS.MAX == class)
# threshold
if(!is.na(threshold))
{ PASS.THRESHOLD = P[SELECTED.ROWS,class] > threshold
SELECTED.ROWS = SELECTED.ROWS[PASS.THRESHOLD]
PASS.THRESHOLD = NULL
}
# rewrite sga file
rewrite.sga(SGA, file, SELECTED.ROWS, POS.UPDATE=NULL, STRAND=NULL)
CLASS.ORDER = CLASS.MAX = SELECTED.ROWS = NULL
}
# shift position update : for each datum, weighted average of all shifts for the most likely class
else if(shift && !flip)
{ if(d != 3)
{ stop(sprintf("Error! With shift, probability array is expected to have 3 dimensions (%d found)!", d)) }
# compute how each site should be moved to accound to the shifting probabilities
# The shifting for each datum is computed as follows :
# For a datum, the class probabilities is defined as the sum of the prob over all the shift states.
# For a datum, the most likely class only is considered and a weighted average of the shift is computed accounting for
# each shifting state probability.
POS.UPDATE = compute.pos.update.shift(P, bin.size)
# compute datum prob as the sum of the probabilities over all the shift states
CLASS.PROB = apply(P,c(1,2),sum)
CLASS.MAX = apply(CLASS.PROB,1,order, decreasing=T)[1,]
SELECTED.ROWS = which(CLASS.MAX == class)
# threshold
if(!is.na(threshold))
{ PASS.THRESHOLD = CLASS.PROB[SELECTED.ROWS,class] > threshold
SELECTED.ROWS = SELECTED.ROWS[PASS.THRESHOLD]
PASS.THRESHOLD = NULL
}
# select POS.UPDATE for kept datum
POS.UPDATE = POS.UPDATE[SELECTED.ROWS]
# rewrite sga file
rewrite.sga(SGA, file, SELECTED.ROWS, POS.UPDATE, STRAND=NULL)
CLASS.PROB = CLASS.MAX = SELECTED.ROWS = NULL
}
# flip position update : for each datum, check the major flip state and set strand accordingly
else if(!shift && flip)
{ # check dimension
if(d != 3)
{ stop(sprintf("Error! With flip, probability array is expected to have 3 dimensions (%d found)!", d)) }
# class probability is the sum over the flip states
CLASS.PROB = apply(P, c(1,2), sum)
# order class prob by decreasing order and retrieve class with highest prob
CLASS.MAX = t(apply(CLASS.PROB, 1, order, decreasing=T))[,1]
SELECTED.ROWS = which(CLASS.MAX == class)
# for each datum, for the most likely class, find most likely flip
FLIP.PROB = matrix(nrow=n.datum, ncol=2)
for(i in 1:n.datum)
{ FLIP.PROB[i,1] = P[i,CLASS.MAX[i],1]
FLIP.PROB[i,2] = P[i,CLASS.MAX[i],2]
}
FLIP.MAX = t(apply(FLIP.PROB, 1, order, decreasing=T))[,1]
STRAND = as.character(FLIP.MAX)
STRAND[STRAND == '2'] = '-'
STRAND[STRAND == '1'] = '+'
# threshold
if(!is.na(threshold))
{ PASS.THRESHOLD = CLASS.PROB[SELECTED.ROWS,class] > threshold
SELECTED.ROWS = SELECTED.ROWS[PASS.THRESHOLD]
PASS.THRESHOLD = NULL
}
# select STRAND for kept datum
STRAND = STRAND[SELECTED.ROWS]
# rewrite sga file
rewrite.sga(SGA, file, SELECTED.ROWS, POS.UPDATE=NULL, STRAND)
CLASS.ORDER = CLASS.MAX = SELECTED.ROWS = STRAND = NULL
}
# shift and flip position update : for each datum, weighted average of forward OR reverse shifts for the most likely class
# and most likely flip.
else if(shift && flip)
{ if(d != 4)
{ stop(sprintf("Error! With shift, probability array is expected to have 4 dimensions (%d found)!", d)) }
# # compute how each site should be moved to accound to the shifting probabilities
# The shifting for each datum is computed as follows :
# For a datum, the class probabilities are defined as the sum of the prob over all the shift states.
# For a datum, the flip probabilities are defined as the sum of the prob over the shift states
# For a datum, the most likely class only is considered and then the most likely flip state is considered.
# If the most likely flip is forward then the weighted average is made using the forward flip shift probabilities only.
# If the most likely flip is reverse then the weighted average is made using the reverse flip shift probabilities only.
POS.UPDATE = compute.pos.update.shift.flip(P, bin.size)
# class probability is the sum over the shift and flip states
CLASS.PROB = apply(P,c(1,2),sum)
# order class prob by decreasing order and retrieve class with highest prob
CLASS.MAX = apply(CLASS.PROB,1,order, decreasing=T)[1,]
SELECTED.ROWS = which(CLASS.MAX == class)
# for each datum, for the most likely class, most likely flip is the sum over the shift prob
FLIP.PROB = matrix(nrow=n.datum, ncol=2)
for(i in 1:n.datum)
{ FLIP.PROB[i,1] = sum(P[i,CLASS.MAX[i],,1])
FLIP.PROB[i,2] = sum(P[i,CLASS.MAX[i],,2])
}
FLIP.MAX = t(apply(FLIP.PROB, 1, order, decreasing=T))[,1]
STRAND = as.character(FLIP.MAX)
STRAND[STRAND == '1'] = '+'
STRAND[STRAND == '2'] = '-'
# threshold
if(!is.na(threshold))
{ PASS.THRESHOLD = CLASS.PROB[SELECTED.ROWS,class] > threshold
SELECTED.ROWS = SELECTED.ROWS[PASS.THRESHOLD]
PASS.THRESHOLD = NULL
}
# select POS.UPDATE and STRAND for kept datum
POS.UPDATE = POS.UPDATE[SELECTED.ROWS]
STRAND = STRAND[SELECTED.ROWS]
# rewrite sga file
rewrite.sga(SGA, file, SELECTED.ROWS, POS.UPDATE, STRAND)
CLASS.PROB = CLASS.MAX = SELECTED.ROWS = NULL
}
SGA = FILERED.ROWS = NULL
}
rewrite.sga <- function(SGA, file, SELECTED.ROWS, POS.UPDATE=NULL, STRAND=NULL)
{ # Given a dataframe containing an SGA file data and a vector of line index of interest ,
# this function updates line positions according to and
# the strand using , if needed. The results are written in a file .
#
# SGA a dataframe containing the data of the SGA file of interest
# file a string, the file in which the results will be written.
# SELECTED.ROWS a vector of int, the line index of to rewrite.
# POS.UPDATE a vector of int, if given, the new position values for lines.
# STRAND a vector of char, if given, the new value strand values in lines.
SGA = SGA[SELECTED.ROWS,]
# update sga positions if needed
if(!is.null(POS.UPDATE))
{ # checks whether POS.UPDATE and SELECTED.ROWS corresponds (should have the same length)
if(length(POS.UPDATE) != length(SELECTED.ROWS))
{ stop(sprintf("Error! Number of position updates differs from number of row to extract (%d vs %d)!", length(POS.UPDATE), length(SELECTED.ROWS))) }
SGA[,3] = SGA[,3] + POS.UPDATE
}
# update sga strand if needed
if(!is.null(STRAND))
{ SGA[,4] = STRAND }
# write SGA
write.table(x=SGA, file=file, quote=F, sep='\t', row.names=F, col.names=F)
SGA = NULL
}
compute.pos.update.shift <- function(P, bin.size)
{ # Given an array of probabilities fom EM classification using shift, this
# function compute the weighted average shift for each datum, considering
# only the major class of the datum.
#
# P a 3D array (n,k,s) of probabilities as returned by the EM algorithm.
# n the number of sites, k the number of classes, s the number of shift
# bin.size an int, the size of the vector count bin given to the EM, in bp.
#
# returns a vector of int, the average shift for each datum.
n.datum = dim(P)[1]
n.class = dim(P)[2]
n.shift = dim(P)[3]
# position difference in bp for each possible shift state
SHIFT.DIFF = (-n.shift/2+(1:n.shift)-0.5)*bin.size
# for each datum, the new class probs is the sum over the shift states probs
# for each datum, select the class with the highest probability
CLASS.PROB = apply(P,c(1,2),sum)
CLASS.MAX = t(apply(CLASS.PROB, 1, order, decreasing = T))[,1]
# compute weighted average shift for each datum
# for each datum, compute the weighted average shifting for the most likely class
# i.g. a datum with 2 classes of prob 0.3 and 0.7 -> class 2 is the most likely
# for class 2, shift states prob are 0.2, 0.8, 0
# bin.size is 10 thus the shift states are -10bp, 0bp, +10bp
# weighted shift average is (0.2*-10) + (0.8*0) + (0*10) = -2bp
SHIFT.WEIGHT = vector(length=n.datum, mode="numeric")
for(i in 1:n.datum)
{ SHIFT.WEIGHT[i] = round(P[i,CLASS.MAX[i],] %*% SHIFT.DIFF) }
SHIFT.DIFF = CLASS.PROB = CLASS.MAX = NULL
return(SHIFT.WEIGHT)
}
compute.pos.update.shift.flip <- function(P, bin.size)
{ # Given an array of probabilities from EM classification using shift and flip,
# this function compute the weighted average shift for each datum, considering
# only the major class of the datum.
#
# P a 4D array (n,k,s,2) of probabilities as returned by the EM algorithm.
# n the number of sites, k the number of classes, s the number of shift
# states, 2 the number of flip states
# bin.size an int, the size of the vector count bin given to the EM, in bp.
#
# returns a vector of int, the average shift for each datum.
n.datum = dim(P)[1]
n.class = dim(P)[2]
n.shift = dim(P)[3]
# position difference in bp for each possible shift state, for both flip states
SHIFT.DIFF.fw = (-n.shift/2+(1:n.shift)-0.5)*bin.size
SHIFT.DIFF.rev = (n.shift/2 -(1:n.shift)+0.5)*bin.size
# for each datum, the new class probs is the sum over the shift and flip states probs
# for each datum, select the class with the highest probability
CLASS.PROB = apply(P,c(1,2),sum)
CLASS.MAX = t(apply(CLASS.PROB, 1, order, decreasing = T))[,1]
# for each datum, for the most likely class, most likely flip is the sum over the shift prob
FLIP.PROB = matrix(nrow=n.datum, ncol=2)
for(i in 1:n.datum)
{ FLIP.PROB[i,1] = sum(P[i,CLASS.MAX[i],,1])
FLIP.PROB[i,2] = sum(P[i,CLASS.MAX[i],,2])
}
FLIP.MAX = t(apply(FLIP.PROB, 1, order, decreasing=T))[,1]
# compute weighted average shift for each datum
# for each datum, compute the weighted average shifting for the most likely class, for the most likely flip state
# i.g. a datum with 2 classes of prob 0.3 and 0.7 -> class 2 is the most likely
# a datum with 2 flip of prob 0.1 and 0.9 -> flip reverse is the most likely
# for class 2, shift states prob are 0.2, 0, 0.1 for forward flip state
# for class 2, shift states prob are 0.1, 0.1, 0.4 for reverse flip state
# bin.size is 10, the shift states are -10bp, 0bp, +10bp
# weighted shift average is done on the reverse flip state
# reverse (0.1*+10) + (0.1*0) + (0.4*-10) = 1 + -4 = -3bp
# weigthed average = -3bp
SHIFT.WEIGHT = vector(length=n.datum, mode="numeric")
for(i in 1:n.datum)
{ if(FLIP.MAX[i] == 1)
{ SHIFT.WEIGHT[i] = round(P[i,CLASS.MAX[i],,1] %*% SHIFT.DIFF.fw) }
else
{ SHIFT.WEIGHT[i] = round(P[i,CLASS.MAX[i],,2] %*% SHIFT.DIFF.rev) }
# SHIFT.FW = P[i,CLASS.MAX[i],,1] %*% SHIFT.DIFF.fw
# SHIFT.REV = P[i,CLASS.MAX[i],,2] %*% SHIFT.DIFF.rev
# SHIFT.WEIGHT[i] = round(SHIFT.FW + SHIFT.REV)
}
SHIFT.DIFF = CLASS.PROB = CLASS.MAX = FLIP.PROB = FLIP.MAX = SHIFT.FW = SHIFT.REV = NULL
return(SHIFT.WEIGHT)
}
# results plotting
# recompute class profile from array
em.class.profiles.from.prob <- function(data, p, shift, flip)
{ # Recomputes the class aggregation patterns from the probability array/matrice and the raw data.
# When the EM algorithm is run, it returns a matrice or an array with the given posteriori probabilities
# for each datum, with a given shift (if enabled) to belong to a given class (with a given shift, if enabled)
# as well as the classes aggregation pattern. This function allows to recompute the later.
#
# Returns a matrix
#
# data a matrice containing the data which were used to run the EM algorithm
# p a matrix, a 3D array or a 4D array (if shift and flip were allowed) containing
# the EM posteriori class probability for each datum of data.
# shift a boolean indicating whether shift was allowed when running the EM algorithm.
# flip a boolean indicating whether flip was allowed when running the EM algorith.
# check probability array format
if(!shift && !flip && (length(dim(p)) != 2))
{ stop(sprintf("Error! Unknown probability array format! Array without shift nor flip has %d/2 dimensions", length(dim(p)))) }
else if(shift && !flip && (length(dim(p)) != 3))
{ stop(sprintf("Error! Unknown probability array format! Array with shift has %d/3 dimensions", length(dim(p)))) }
else if(shift && flip && (length(dim(p)) != 4))
{ stop(sprintf("Error! Unknown probability array format! Array with shift and flip has %d/4 dimensions", length(dim(p)))) }
else if(!shift && flip && (length(dim(p)) != 3))
{ stop(sprintf("Error! Unknown probability array format! Array with flip and no shift has %d/3 dimensions", length(dim(p)))) }
else if(!shift && flip && (dim(p)[3] != 2))
{ stop(sprintf("Error! Unknown probability array format! Array with flip and no shift should have 3rd dimension equal to two (two flip states, %d detected)", dim(p)[3])) }
c = 0
# no shift no flip
if(!shift && !flip)
{ c = (t(p) %*% data)/colSums(p) }
# with shift only
else if(shift && !flip)
{ S = dim(p)[3]
L = dim(data)[2]-S
for(s in 1:S)
{ # c = c + (t(p[,,s]) %*% data[,k:(s+L-1)])/colSums(p[,,s])}
c = c + (t(p[,,s]) %*% data[,s:(s+L-1)])
}
c = c / apply(p, 2, sum)
}
# with flip only
else if(!shift && flip)
{ c = (t(p[,,1]) %*% data) +(t(p[,,2]) %*% t(apply(data,1,rev)))
c = c / apply(p,2,sum)
}
# with shift and flip
else if(shift && flip)
{ S = dim(p)[3]
L = dim(data)[2]-S
c = 0
for(s in 1:S)
{ c = c + (t(p[,,s,1]) %*% data[,s :(s+L-1)]) +
(t(p[,,s,2]) %*% t(apply(data[,(S-s+1):(S+L-s)], 1, rev)))
}
c = c / apply(p, 2, sum)
}
return(c)
}
em.plot.class.profile <- function(data.list, prob, plot.title, file, iter.n, shift.n=0, flip=FALSE, col.names, lty, lwd, col, row.masker=NULL)
{ # Simple modification of em.plot.class.profile.multiple(). This function allows to set the line color instead of drawing each class with a different
# color as em.plot.class.profile.multiple() does.
#
# Wrapper function to overlay EM class profiles of several datasets. Given the n sets of raw data and
# a class probability matrice/array as returned by EM algorithm, the class profiles will be recomputed
# (taking an eventual shift into account) and all the data classes ploted separately on a single figure.
#
# Returns Nothing
#
# data.list a list containing N data matrices (K rows x j columns) to be overlaid on the plot. The
# first matrix of the list MUST be the one on which the EM algorithm has been run. A null row
# filter will be set accordingly to this first matrix (rows corresponding to null rows in the 1st
# matrix will be removed from the following matrices).
# prob a matrix, a 3D array (if shift was allowed) or a 4D array (if shift and flip were allowed) containing
# the EM posteriori class probability for each datum in the original data.
# plot.title a string containing the plot title.
# file a string, the file where the plot will be saved.
# iter.n an int, the number of iteration of EM algorithm used to produce p and will be used as shifting.
# information to recompute the class profiles. Will also be used as part of the file name.
# shift.n an integer indicating the shift allowed when the EM algorithm was run. If shifting was not allowed,
# this argument should be set to 0.
# flip a boolean indicating whether flipping was allowed when the EM algorithm was run.
# col.names a vector of strings containing the name the columns of the data on which the EM algorithm was run.
# This will be used to label thick under the x-axis (taking a shift into account).
# lty a vector of N int, indicating which line type (lty) should be used to plot each of the N data types.
# lwd a vector of N int, indicating which line width (lwd) should be used to plot each of the N data types.
# col a vector of N colors (int, str, any usable color code) indicating the color of each line.
# row.masker a vector of at max K int specifying which data raw should not be drawn on the plots. If NULL, no
# row masking will be performed and all the data will be ploted.
# determine shift and flip
if(shift.n > 0) { shift <- TRUE }
else { shift <- FALSE }
if(flip) { flip <- TRUE }
else { flip <- FALSE }
# check probability array format
if(!shift && !flip && (length(dim(prob)) != 2))
{ stop(sprintf("Error! Unknown probability array format! Array without shift nor flip has %d/2 dimensions", length(dim(prob)))) }
else if(shift && !flip && (length(dim(prob)) != 3))
{ stop(sprintf("Error! Unknown probability array format! Array with shift has %d/3 dimensions", length(dim(prob)))) }
else if(shift && flip && (length(dim(prob)) != 4))
{ stop(sprintf("Error! Unknown probability array format! Array with shift and flip has %d/4 dimensions", length(dim(prob)))) }
else if(!shift && flip && (length(dim(prob)) != 3))
{ stop(sprintf("Error! Unknown probability array format! Array with flip and no shift has %d/3 dimensions", length(dim(prob)))) }
else if(!shift && flip && (dim(prob)[3] != 2))
{ stop(sprintf("Error! Unknown probability array format! Array with flip and no shift should have 3rd dimension equal to two (two flip states, %d detected)", dim(prob)[3])) }
# number of datasets present
n.data <- length(data.list)
# number of classes
K <- dim(prob)[2]
# number of elements (data row)
N <- dim(prob)[1]
if(length(lty) != n.data)
{ stop("Error! lty argument is too short or too long! Should be the same length as data.list.") }
if(length(lwd) != n.data)
{ stop("Error! lwd argument is too short or too long! Should be the same length as data.list.") }
if(length(row.masker) > dim(prob)[1])
{ stop("Error! row.maker is too long! It cannot contain more elements than the data matrix row number.") }
# determine xlab marks and thicks given shift
# compute from-to limits on xaxis with shift
n.bins <- length(col.names)
n.less.bin.left <- floor(shift.n/2)
n.less.bin.right <- ceiling(shift.n/2)
from.to <- col.names[n.less.bin.left:(n.bins-n.less.bin.right)]
# for plotting purposes, determine where the ticks and what ticks to put on x-axis
x.at <- seq(1,199-shift.n,11)
x.axt <- from.to[x.at]
# compute class profiles
profil.list <- list()
for(i in 1:n.data)
{ tmp.prob <- prob
tmp.data <- data.list[[i]]
if(!is.null(row.masker))
{ # prob is an array
if(shift.n)
{ tmp.prob <- tmp.prob[-row.masker,,] }
# prob is a matrix
else
{ tmp.prob <- tmp.prob[-row.masker,] }
tmp.data <- tmp.data[-row.masker,]
}
tmp.c <- em.class.profiles.from.prob(tmp.data, tmp.prob, shift=shift, flip=flip)
profil.list[[i]] <- tmp.c
tmp.prob <- NULL
tmp.data <- NULL
}
# compute class probability, 'apply(prob,2,sum)' works the same if prob is a matrix or an array
if(is.null(row.masker))
{ prob.vector <- apply(prob, 2, sum)/sum(prob) }
else
{ # prob is an array
if(shift.n)
{ tmp.prob <- prob[-row.masker,,] }
# prob is a matrix
else
{ tmp.prob <- prob[-row.masker,] }
prob.vector <- apply(tmp.prob, 2, sum)/sum(tmp.prob)
}
# plot
# address where the plot will be registered
png(file, width=600, height=900)
opar = par(mfrow=c(K+1,1))
y.lim <- c(0, 1.2)
x.len <- dim(profil.list[[1]])[2]
x <- c(1:x.len)
# data aggregation plot
# aggregation profile is the sum of the class profiles weighted by their probability
plot.main <- sprintf("%s\nAggregation (N=%d)", plot.title, N)
d <- prob.vector %*% profil.list[[1]]
d <- d / max(d)
plot(x, d, ylim=y.lim, main=plot.main, xlab="Approximated dist. to pattern [bp]",
ylab="Prop. weighted signal", xaxt='n', yaxt='n', bty='n', col=col[1], type = 'l', lty=lty[1], lwd=lwd[1])
axis(1, at=x.at, labels=x.axt, las=2)
axis(2, at=seq(0,1,0.25), labels=seq(0,1,0.25))
if(n.data > 1)
{ for(i in 2:n.data)
{ y <- prob.vector %*% profil.list[[i]]
y <- y / max(y)
lines(x, y, col=col[i], lty=lty[i], lwd=lwd[i])
}
}
# loop over classes -> 1 plot/class
for(i in 1:K)
{ j <- 1
d <- profil.list[[j]][i,] / max(profil.list[[j]][i,])
plot.main <- sprintf("class %d/%d", i, K)
plot(d, ylim=y.lim, main=plot.main, xlab="Approximated dist. to pattern [bp]",
ylab="Prop. weighted signal", xaxt='n', yaxt='n', bty='n', col=col[j], type = 'l', lty=lty[j], lwd=lwd[j])
# writes class overall probability
text(x.len-2, 1.1, labels=round(prob.vector[i],3), cex=1.5, col="black")
# draw axis
axis(1, at=x.at, labels=x.axt, las=2)
axis(2, at=seq(0,1,0.25), labels=seq(0,1,0.25))
# loop over matrix -> add data to current plot
if(n.data > 1)
{
for(j in 2:n.data)
{ x <- c(1:length(profil.list[[j]][i,]))
y <- profil.list[[j]][i,] / max(profil.list[[j]][i,])
lines(x, y, col=col[j], lty=lty[j], lwd=lwd[j])
}
}
}
par(opar)
dev.off()
}
# wrappers to run EM clustering
em.shape.shift.flip <- function(MATRIX, k=1, iter=10, shift=10, seeding="iterative")
{ # Wraper to run em.shape() with flip and with or without shift on a data matrix . The algorithm will
# be run times, assigning each datum to all classes with a certain probability. After the run the
# probability array(s) returned by em.shape() or any of its derivative function is(are) returned.
#
# Iterative seeding : the 1st class profile (flat class) is initialized to the mean of the data (mean per column). The
# data are then iteratively classified using 1 more class (resetting an untrained flat class at each iteration) until
# classes have been reached. Before adding an extra class, the algorithm is run times.
# Random seeding : prior probabilities for each datum to belong to all classes are randomly assigned, using a Beta
# distribution, at the beginning.
#
# Returns a list of numerical arrays. The list length is one if seeding is "random", if seeding is "iterative"
# where each element is a classification probability array with 1,2,..., classes.
# The array dimensions are (n,k,f) without shift or (n,k,s,f) with shift and flip where n is the number of
# datum, k the number of classes, s the number of allowed shift states and f=2 (2 shift states : non-flip,
# reversed).
#
# MATRIX a numerical matrix on which the classification is to be performed.
# k the number of classes to be optimized by the EM algorithm IN ADDITION to the flat class
# iter an int, the number of iteraction to optimized the current number of classes
# shift an int, the maximum shift allowed for each data row (in bins)
# seeding a string indicating the initializing strategy to use ("iterative" or "random"). Using
# "iterative" will lead to the use of an untrained flat class to which one class will be
# added a the time until reaching final classes. When using "random", the class probabilities
# are initialized randomly following a beta distribution.
p_list= list()
warning.flag = FALSE
if(seeding != "iterative" && seeding != "random")
{ stop("Error! Incorrect seeding method!") }
## EM probabilistic clustering ##
# without shifting
if(shift == 0)
{ N = dim(MATRIX)[1] # number of bins in each sample
L = dim(MATRIX)[2] # number of bins in the pattern
# using iterative partitioning with untrained flat class
if(seeding=="iterative")
{ K = k -1 # number of classes, with a flat class -> final number of class is k
# code adapted from http://ccg.vital-it.ch/var/sib_april15/cases/nair12/ctcf_ChipPart.html#hide4
c = colMeans(MATRIX) # contains the class profiles to optimize, contains less bins than sample because of the shifting
flat = matrix(data=mean(MATRIX), nrow=1, ncol=L) # contains the flat class profile which value is the mean of the data
q = q0 = dnorm(1:2, floor(1/2)+1, 1) / sum(dnorm(1:2, floor(1/2)+1, 1)) # contains the prior probability for each class, detrmines the number of classes when em is run
warning.flag = FALSE
for (m in 1:K)
{ c = rbind(flat,c)
q = rbind(q0/m,q)
q = q / sum(q)
for(i in 1:iter)
{ c[1,] = flat
em.return = em_shape_flip(c,q,MATRIX)
c = em.return[[1]]; q = em.return[[2]]; p = em.return[[3]];
em.return = NULL
# check whether a class has an overall probability of 0, if true algorithm stops
# this should be checked because it means that some prob values are now too small to be
# properly represented. Once it is 0, it will remain 0 and can lead to troubles if too
# many of them appears (like division by 0).
if(length(which(q == 0)) != 0)
{ warning.flag = TRUE
warning.msg = capture.output(q)
warning.msg[1] = paste0(sprintf("nNumber of data row used : %d\nAlgorithm stopped at %d classes, %d iteration! A class posterior probability is zero!\n", nrow(MATRIX), m, i), warning.msg[1])
warning.msg = paste(warning.msg, collapse = "\n")
warning(warning.msg)
break
}
}
if(warning.flag) { break }
p_list[[m]] = p
}
}
# using random seeding
else if(seeding=="random")
{ K = k # number of classes
p = matrix(nrow=N, ncol=K)
q = matrix(nrow=K, ncol=2)
for(m in 1:K)
{ p[,m] = rbeta(N, N**-0.5, 1) }
c = (t(p) %*% MATRIX) / colSums(p)
q[,1] = q[,2] = rep(1/K, K)
q = q / sum(q)
for(i in 1:iter)
{ em.return = em_shape_flip(c, q, MATRIX)
c = em.return[[1]]; q = em.return[[2]]; p = em.return[[3]]
em.return = NULL
# check whether a class has an overall probability of 0, if true algorithm stops
# this should be checked because it means that some prob values are now too small to be
# properly represented. Once it is 0, it will remain 0 and can lead to troubles if too
# many of them appears (like division by 0).
if(length(which(q == 0)) != 0)
{ warning.flag = TRUE
warning.msg = capture.output(q)
warning.msg[1] = paste0(sprintf("Number of data row used : %d\nAlgorithm stopped at %d classes, %d iteration! A class posterior probability is zero!\n", nrow(MATRIX), m, i), warning.msg[1])
warning.msg = paste(warning.msg, collapse = "\n")
warning(warning.msg)
break
}
}
p_list[[1]] = p
}
p = NULL
}
# with shifting
else
{ S = shift # maximum shift in bins
N = dim(MATRIX)[1] # number of bins in each sample
L = dim(MATRIX)[2] - S # number of bins in the pattern taking the shift into account
# using iterative partitioning with untrained flat class
if(seeding=="iterative")
{ # code adapted from http://ccg.vital-it.ch/var/sib_april15/cases/nair12/ctcf_ChipPart.html#hide4
K = k -1 # number of classes, with a flat class -> final number of class is k
c = colMeans(MATRIX[,(floor(S/2)+1):(floor(S/2)+L)]) # contains the class profiles to optimize, contains less bins than sample because of the shifting
flat = matrix(data=mean(MATRIX), nrow=1, ncol=L) # contains the flat class profile which value is the mean of the data
q = array(dim=c(1,S,2)) # contains the prior probability for each class, detrmines the number of classes when em is run
q[,,1] = dnorm(1:S,floor(S/2)+1,1)/sum(dnorm(1:S,floor(S/2)+1,1)) / 2 # each class starts with equal prob for both flip states
q[,,2] = dnorm(1:S,floor(S/2)+1,1)/sum(dnorm(1:S,floor(S/2)+1,1)) / 2 # DIVIDED BY TWO BECAUSE EACH CLASS HAS TWO FLIP STATES
q0 = q
warning.flag = FALSE
for (m in 1:K)
{ c = rbind(flat,c)
q.tmp = array(dim=c(m+1,S,2))
q.tmp[2:(m+1),,] = q
q.tmp[1,,] = q0/m
q.tmp = q.tmp/sum(q.tmp)
q = q.tmp
q.tmp = NULL
for(i in 1:iter)
{ q = reg_shift_flip(q)
c[1,] = flat
em.return = em_shape_shift_flip(c,q,MATRIX)
c = em.return[[1]]; q = em.return[[2]]; p = em.return[[3]];
em.return = NULL
# check whether a class has an overall probability of 0, if true algorithm stops
# this should be checked because it means that some prob values are now too small to be
# properly represented. Once it is 0, it will remain 0 and can lead to troubles if too
# many of them appears (like division by 0).
if(length(which(q == 0)) != 0)
{ warning.flag = TRUE
warning.msg = capture.output(q)
warning.msg[1] = paste0(sprintf("Number of data row used : %d\nAlgorithm stopped at %d classes, %d iteration! A class posterior probability is zero!\n", nrow(MATRIX), m, i), warning.msg[1])
warning.msg = paste(warning.msg, collapse = "\n")
warning(warning.msg)
break
}
}
if(warning.flag) { break }
p_list[[m]] = p
}
}
# using random seeding
else if(seeding=="random")
{ K = k # number of classes
p = matrix(nrow=N, ncol=K)
q = array(dim=c(K, S, 2))
for(i in 1:K)
{ p[,i] = rbeta(N, N**-0.5, 1) }
c = (t(p) %*% MATRIX[,(floor(S/2)+1):(floor(S/2)+L+1)]) / colSums(p)
for(m in 1:K)
{ q[m,,] = matrix(nrow=S, ncol=2, data=rep(1/K, S*2)) }
q = q / sum(q)
for(i in 1:iter)
{ q = reg_shift_flip(q)
em.return = em_shape_shift_flip(c, q, MATRIX)
c = em.return[[1]]; q = em.return[[2]]; p = em.return[[3]]
em.return = NULL
# check whether a class has an overall probability of 0, if true algorithm stops
# this should be checked because it means that some prob values are now too small to be
# properly represented. Once it is 0, it will remain 0 and can lead to troubles if too
# many of them appears (like division by 0).
if(length(which(q == 0)) != 0)
{ warning.flag = TRUE
warning.msg = capture.output(q)
warning.msg[1] = paste0(sprintf("Number of data row used : %d\nAlgorithm stopped at %d classes, %d iteration! A class posterior probability is zero!\n", nrow(MATRIX), m, i), warning.msg[1])
warning.msg = paste(warning.msg, collapse = "\n")
warning(warning.msg)
break
}
}
p_list[[1]] = p
}
p = NULL
}
return(p_list)
}