Fake Data Exercise for the Equivalence of Ward1 and Epsilon-alpha Clustering
11 min read · Aug 27, 2023
This is a companion post to one I’m still writing. It should be largely comprehensible on its own if you read the comments in the code, however. I guess I’ll just entice you with the basic plots of the results:
# I want a slightly more complicated example for Ward1/e(A,B)
# this is because I discovered a "trap for younger players" with my functions
# and also I'd like to compare the merits of Ward1 and Ward2 a little
# my alphabet toy example from earlier won't really cut it for that
library(dendextend)
set.seed(26082023)
n.e1 = 52
e1 = data.frame(x = rnorm(n.e1, 0, 0.25 / 10), y = rnorm(n.e1, 0, 0.25 / 10), col = "blue")
n.e2 = 71
e2 = data.frame(x = rnorm(n.e2, 0, 1 / 10), y = rnorm(n.e2, 0, 1 / 10), col = "red")
n.e3 = 64
e3 = data.frame(x = rnorm(n.e3, 4 / 10, 0.25 / 10), y = rnorm(n.e3, -1 / 10, 0.25 / 10), col = "green2")
n.e4 = 39
e4 = data.frame(x = rnorm(n.e4, 4 / 10, 0.25 / 10), y = rnorm(n.e4, 1.25 / 10, 0.25 / 10), col = "sienna")
n.e5 = 112
e5 = data.frame(x = runif(n.e5, 2 / 10, 6.064 / 10), y = runif(n.e5, -2.288 / 10, 2 / 10), col = "turquoise1")
e.gen = rbind(e1, e2, e3, e4, e5)
e.df = e.gen[, 1:2]
plot(x = e.gen$x, y = e.gen$y, col = e.gen$col, pch = 16, asp = 1,
main = "Simulation Study Coloured by Actual Generative Process")
# running a Ward1 clustering
e.epsalf = hclust(dist(e.df), method = "ward.D")
# plotting the dendrogram
# I want to have a few identifiable observations so I need to create a label vector
# the code would be much more straightforward otherwise: just plot(e.epsalf, hang = -1)
# if I didn't create a label vector it would try to print EVERY row number
# that is not only illegible but also very ugly
e.lab = rep("", dim(e.df)[1])
set.seed(14145236)
e.labgen = c(sample(1:n.e1, 5),
sample(1:n.e2, 5) + n.e1,
sample(1:n.e3, 5) + n.e1 + n.e2,
sample(1:n.e4, 5) + n.e1 + n.e2 + n.e3,
sample(1:n.e5, 5) + n.e1 + n.e2 + n.e3 + n.e4)
e.lab[e.labgen] = e.labgen
plot(e.epsalf, labels = e.lab, hang = -1, cex = 0.65,
main = "Full Dendrogram for Example with k = 5 Clusters")
rect.hclust(e.epsalf, k = 5)
# ideally we select clusters where the branch lengths are all long
# k = 3 or k = 4 would be better choices than k = 5
# also: these heights are essentially linear information if they're actually e-distances
# Romesburg was suggesting non-linear information creates a visual bias
# the relative e-distance between clusters is such that the full plot was unreadable
# we can use package dendextend to zoom in on the dendrogram
# first we need the dendrogram information in a dendrogram object
e.dend = as.dendrogram(e.epsalf)
# the dendrogram stores its leaves in plotting order rather than the original row order
# so, I need to recreate my label vector to match
testcase = e.lab[labels(e.dend)]
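# (order.dendrogram(e.dend) also returns the original row numbers in leaf order,
# so e.lab[order.dendrogram(e.dend)] is another way to build the reordered labels)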
# and how I was doing the colours didn't work either
epsalf.grps = cutree(e.epsalf, k = 5)
# this creates a vector of cluster IDs for each observation
# cluster IDs are numeric, e.g. this might start 1, 2, 1, 1, 3, 4, 4, 4, 3, 5...
# (in reality it's 1 for ages)
epsalf.cols = c("tomato", "plum", "steelblue", "peru", "yellowgreen")[epsalf.grps]
# this repeats the first colour whenever the Cluster ID is 1, the second when it's 2 etc.
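# a tiny illustration of that indexing trick, using made-up cluster IDs:
c("tomato", "plum", "steelblue")[c(1, 2, 1, 1, 3)]
# returns "tomato" "plum" "tomato" "tomato" "steelblue"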
e.dend = color_labels(e.dend, col = epsalf.cols[labels(e.dend)])
# this tells the dendrogram how to colour the labels
# and now we can plot
e.dend %>%
set("labels", testcase) %>% # setting the labels to my label vector
set("labels_cex", .8) %>% # making the labels smaller
plot(ylim = c(0, 2.6), main = "Zoomed in Dendrogram with k = 5 Clusters")
# note that I have used a different set of colours
# this is because I don't want to imply that hclust() is trying to recover the actual groups
# if it were, clustering would be a classification technique, and it's not
# however, we might as well see how well the actual groups were recovered
# Comparing the clusters to the generative groups
par(mfrow = c(1, 2))
# so the two plots are made in the same window
plot(x = e.gen$x, y = e.gen$y, col = e.gen$col, pch = 16, asp = 1,
main = "Coloured by Actual Generative Process")
plot(x = e.gen$x, y = e.gen$y, col = epsalf.cols, pch = 16, asp = 1,
main = "Coloured by Epsilon-Alpha Clustering\n(for k = 5)")
# we can see that, if this really is Epsilon-alpha (1) clustering in R,
# it has not separated the red and blue groups, which wouldn't be interesting except...
# Szekely and Rizzo specifically said it could distinguish same mean clusters
# I confess, I'm not really sure how they envisioned that to work
# maybe they meant something like an inner and outer circle
# the brown (sienna) and green groups have been separated
# but there's overlap with the turquoise group, which is also split in three
# let's work with the heights now
# to make this easier, let's replot the dendrograms
e.dend %>%
set("labels", testcase) %>% # setting the labels to my label vector
set("labels_cex", .8) %>% # making the labels smaller
plot(main = "Full")
e.dend %>%
set("labels", testcase) %>% # setting the labels to my label vector
set("labels_cex", .8) %>% # making the labels smaller
plot(ylim = c(0, 2.6), main = "Zoomed In")
# there's a generic function for the height of the lowest common node of two observations
# cophenetic()
# I find it easier to use as.matrix(cophenetic())[i, j] however
# per the earlier dendrograms we know the following [i,j] pairs are in different clusters
as.matrix(cophenetic(e.epsalf))[61, 338]
# tomato and steelblue
# they're on opposite sides of the final merge
# i.e. the lowest common node is the top of the tree
as.matrix(cophenetic(e.epsalf))[338, 285]
# steelblue and yellowgreen
# these two are on opposite sides of the penultimate merge (agglomeration)
# i.e. the lowest common node is the second highest node in the whole tree
as.matrix(cophenetic(e.epsalf))[213, 214]
# peru (brownish) and plum
# and these are opposite sides of the fourth to last merge
# i.e. the lowest common node is the fourth highest node in the tree
# to use my function for e(A,B) we need to know the clusters these observations belong to
# this is easy, too
epsalf.grps[c(61, 338, 285, 213, 214)]
# let's introduce my function
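# for reference, the e-distance the function implements (per Szekely and Rizzo) is
# e(A,B) = (n1*n2 / (n1 + n2)) * (  (2 / (n1*n2)) * sum_ij ||a_i - b_j||
#                                  - (1 / n1^2)   * sum_ij ||a_i - a_j||
#                                  - (1 / n2^2)   * sum_ij ||b_i - b_j|| )
# where cluster A has n1 members a_i and cluster B has n2 members b_j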
edist = function(data, indices1, indices2){
  # this function returns the e-distance between two clusters
  # it takes as inputs:
  #   the raw values of the dataset that was clustered
  #   a logical vector indicating the observations in cluster 1
  #   a logical vector indicating the observations in cluster 2
  # I believe e-distance to stand for energy distance
  if(any(class(indices1) != "logical", class(indices2) != "logical")){
    stop("At least one index vector is not logical")
  }else{
    # the energy distance is a weighted sum of three double sums
    # the weightings are calculated using the number of observations in each cluster
    n1 = sum(indices1)
    n2 = sum(indices2)
    # the sums that are summed are Euclidean vector norms of the forms:
    # (1) ||a_i - b_j||
    #     a_i represents all observations that are members of cluster one
    #     b_j represents all observations that are members of cluster two
    # (2) ||a_i - a_j||
    #     a_i as in (1), i.e. i = 1, 2, ... , n1
    #     a_j also represents /all/ members of a, i.e. j = 1, 2, ... , n1
    # (3) ||b_i - b_j||
    #     with b_i and b_j being homologous with a_i and a_j in (2)
    # it happens that the definition of the Euclidean distance between vectors p, q is:
    #   d(p,q) = ||p - q||
    # i.e. the Euclidean vector norm of the vector difference
    # I exploit this fact to allow dist() to calculate the Euclidean vector norms
    mill = as.matrix(dist(data))
    # the as.matrix() form allows the indices to do all the work
    sum1 = sum(mill[indices1, indices2])
    sum2 = sum(mill[indices1, indices1])
    sum3 = sum(mill[indices2, indices2])
    # and finally the calculation:
    edist = ((n1 * n2) / (n1 + n2)) * (
      ((2 / (n1 * n2)) * sum1) -
      ((1 / (n1^2)) * sum2) -
      ((1 / (n2^2)) * sum3)
    )
    data.frame(edist, n1, n2, sum1, sum2, sum3)
  }
}
# now my function records the e(A,B) for a PROPOSED merge
# not every possible merge actually happened
# the peru and plum clusters were actually merged
# these turn out to be clusters 4 and 2
# per cophenetic() the height should be 2.993581
# let's apply my function
edist(e.df,
epsalf.grps == 4,
epsalf.grps == 2)
# voila
# in contrast, the steelblue and yellowgreen groups were NOT merged
# these are clusters 3 and 5
# per cophenetic() the height should be 12.16914
# if we use my function like we just did, we'll get the WRONG height
edist(e.df,
epsalf.grps == 3,
epsalf.grps == 5)
# this really is the e distance between clusters 5 and 3
# it's just that it was never the smallest e distance when the agglomerations were being made
# to recover the cophenetic distance between the clusters we must reflect that process
# the steelblue cluster was merged with the combo of the plum, peru and yellowgreen clusters
edist(e.df,
epsalf.grps == 3,
epsalf.grps == 5 | epsalf.grps == 4 | epsalf.grps == 2)
# and that is the same value obtained from cophenetic()
# this is the trap for younger players that I was talking about
# my function calculates the e distances between vectors/matrices/data
# Epsilon-alpha is a hierarchical agglomerative method (a "hagglo") so there is a hierarchy to the mergers
# e distance is only the height/cophenetic distance for clusters that were ACTUALLY merged
# in other words, if comparing A with B
# when B is a subcluster of C
# and A and C were merged
# and C is the result of the merger of B and D
# my function requires inputs A and C, to calculate the cophenetic distance of A and B
# this is equivalent to comparing A with the merger of B and D
# if cluster A and B were not merged directly then
# the e(A,B) distance is not the cophenetic distance of clusters A and B
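# to make that concrete with the clusters above: e(A,B) for A = cluster 3 and
# B = cluster 5 is NOT their cophenetic distance, but e(A,C) with C = the union of
# clusters 5, 4 and 2 is, because that merge actually happened; a quick check:
all.equal(edist(e.df, epsalf.grps == 3, epsalf.grps %in% c(5, 4, 2))$edist,
          as.matrix(cophenetic(e.epsalf))[338, 285])
# this should come back TRUE (up to numerical tolerance) if the heights really are e-distances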
# let's now consider the equivalence of Epsilon-alpha (2) clustering with Ward Clustering
# to do this we will modify my edist() function so it creates e^2(A,B) distances
e2.dist = function(data, indices1, indices2){
  # this function returns the squared e-distance, e^2(A,B), between two clusters
  # it takes as inputs:
  #   the raw values of the dataset that was clustered
  #   a logical vector indicating the observations in cluster 1
  #   a logical vector indicating the observations in cluster 2
  # I believe e-distance to stand for energy distance
  if(any(class(indices1) != "logical", class(indices2) != "logical")){
    stop("At least one index vector is not logical")
  }else{
    # the energy distance is a weighted sum of three double sums
    # the weightings are calculated using the number of observations in each cluster
    n1 = sum(indices1)
    n2 = sum(indices2)
    # the sums that are summed are Euclidean vector norms of the forms:
    # (1) ||a_i - b_j||
    #     a_i represents all observations that are members of cluster one
    #     b_j represents all observations that are members of cluster two
    # (2) ||a_i - a_j||
    #     a_i as in (1), i.e. i = 1, 2, ... , n1
    #     a_j also represents /all/ members of a, i.e. j = 1, 2, ... , n1
    # (3) ||b_i - b_j||
    #     with b_i and b_j being homologous with a_i and a_j in (2)
    # it happens that the definition of the Euclidean distance between vectors p, q is:
    #   d(p,q) = ||p - q||
    # i.e. the Euclidean vector norm of the vector difference
    # I exploit this fact to allow dist() to calculate the Euclidean vector norms
    # of course, we now want to square those distances
    mill = as.matrix(dist(data))^2
    # the as.matrix() form allows the indices to do all the work
    sum1 = sum(mill[indices1, indices2])
    sum2 = sum(mill[indices1, indices1])
    sum3 = sum(mill[indices2, indices2])
    # and finally the calculation:
    ((n1 * n2) / (n1 + n2)) * (
      ((2 / (n1 * n2)) * sum1) -
      ((1 / (n1^2)) * sum2) -
      ((1 / (n2^2)) * sum3)
    )
  }
}
# per Murtagh and Legendre we know
# hclust(dist(e.df)^2, method = "ward.D"), and
# hclust(dist(e.df), method = "ward.D2")
# are equivalent morphologically, with the heights of the former being the squares of the latter.
# I don't want to recreate their result (go read either the 2011 or 2014 paper)
# they have reproducible examples and R code
# just adapt it for the new version of hclust()
# I want to show that the heights of "ward.D2" are the square roots of the e^2(A,B) distances
# obviously the same trap for younger players applies
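# a minimal sketch of that adaptation (my code, not theirs): the two calls should
# produce identical merge structures, and the "ward.D2" heights should be the
# square roots of the "ward.D"-on-squared-distances heights
w1 = hclust(dist(e.df)^2, method = "ward.D")
w2 = hclust(dist(e.df), method = "ward.D2")
all.equal(w1$merge, w2$merge)
all.equal(sqrt(w1$height), w2$height)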
# running a Ward2 clustering with the same example we've just been using
e.ward = hclust(dist(e.df), method = "ward.D2")
# let's first consider the Ward result versus the Epsilon-alpha (1) result
# and then establish the equivalence of the heights with e^2(A,B)
library(latex2exp)
plot(e.ward, labels = e.lab, hang = -1, cex = 0.65,
main = TeX('$Ward\\, Clustering$'))
plot(e.epsalf, labels = e.lab, hang = -1, cex = 0.65,
main = TeX('$\\textit{Ɛ}^{(1)}\\, Clustering$'))
# obviously the heights are different
# the key thing here is the different morphology, i.e. pattern of nodes and branches
# honestly, I think Ward is saying k = 3 or k = 8
# though at this scale, Epsilon-alpha (1) looks more like k = 3
# with k = 4 and k = 5 being equally valid, rather than k = 4 looking better
# let's now plot the points coloured by each system
e.ward.grps = cutree(e.ward, k = 5)
e.ward.cols = c("royalblue1", "darkorchid", "orange", "darkgoldenrod", "seagreen")[e.ward.grps]
# I again use a distinct set of colours
plot(x = e.gen$x, y = e.gen$y, col = e.ward.cols, pch = 16, asp = 1,
main = TeX('$Ward\\, Clustering$'))
plot(x = e.gen$x, y = e.gen$y, col = epsalf.cols, pch = 16, asp = 1,
main = TeX('$\\textit{Ɛ}^{(1)}\\, Clustering$'))
# honestly, Epsilon-alpha (1) is doing a better job
# but definitely in a "less wrong" sense than a "more right" one
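# the same quick cross-tabulation against the generative groups lets you judge that for yourself
table(generated = e.gen$col, cluster = e.ward.grps)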
# now we can return to the question of e^2(A,B) heights
# set up for coloured labels
e.denward = as.dendrogram(e.ward)
samecase = e.lab[labels(e.denward)]
e.denward = color_labels(e.denward, col = e.ward.cols[labels(e.denward)])
e.denward %>%
set("labels", samecase) %>% # setting the labels to my label vector
set("labels_cex", .8) %>% # making the labels smaller
plot(, main = "Full (Ward)")
e.denward %>%
set("labels", samecase) %>% # setting the labels to my label vector
set("labels_cex", .8) %>% # making the labels smaller
plot(ylim = c(0, 1.3), main = "Zoomed In (Ward)")
# again, we can use cophenetic() to get the actual heights
as.matrix(cophenetic(e.ward))[114, 292]
# darkorchid (purple) and seagreen
# they're on opposite sides of the final merge
# i.e. the lowest common node is the top of the tree
as.matrix(cophenetic(e.ward))[114, 31]
# darkorchid and royalblue
# these two are on opposite sides of the highest merge in the left hand side of the tree
# it looks like their lowest common node is the fourth highest merge
as.matrix(cophenetic(e.ward))[151, 188]
# darkgoldenrod and orange
# they're on opposite sides of the highest merge on the right hand side of the tree
# i.e. the second highest merge in the tree is their lowest common node
# let's now find out the groups
e.ward.grps[c(114, 292, 31, 151, 188)]
# okay so let's now compare the e^2(A,B) distances to the cophenetic distances
# remembering the trap for younger players, we need merges that actually happened
# of the three cophenetic distances we calculated only one was for an actual merge
# that was for darkorchid and royalblue, i.e. clusters 2 and 1
# the target value is 1.032443
sqrt(e2.dist(e.df,
e.ward.grps == 2,
e.ward.grps == 1))
# and there we have it
# the heights of the Ward clustering method are the square roots of the e^2(A,B) distances
# we can check this using the highest merge on the right hand side of the tree
# this is meant to be 2.001136
sqrt(e2.dist(e.df,
e.ward.grps == 4,
e.ward.grps == 3 | e.ward.grps == 5))
# voila!
# I guess here endeth the lesson