Hecuba (C)

Published

October 30, 2025

Code

knitr::opts_chunk$set(
  message = FALSE,
  warning = FALSE,
  comment = NA
)

library(conflicted)
library(tidyverse)
library(patchwork)
conflict_prefer_all("dplyr", quiet = TRUE)
library(here)
library(scales)
library(hrbrthemes)
library(kableExtra)
library(tidymodels)
library(ggrepel)
library(cluster)
library(ggdendro)
library(gt)
library(FactoMineR)
library(broom)
library(wesanderson)
library(treemapify)


theme_set(theme_ipsum(base_family = "Aktiv Grotesk"))
theme_update(
  panel.grid.minor = element_line(linetype = "dotted", linewidth = 0.5),
  panel.grid.major = element_line(linetype = "dotted", linewidth = 0.5)
)

pal <- wes_palette(name = "BottleRocket2")

df <- read_csv(here("data/clean/hecuba-c.csv")) 


npc <- function(nps) {
  if(!is.numeric(nps)) {
    abort("Variable must be numeric")
  }
  
  nps = as.integer(nps)
  
  if(any(nps < 0L) || any(nps > 10L)) {
    abort("Variable must be greater than or equal to 0 and less than or equal to 10")
  }
  
  npc = case_when(
    nps <= 6 ~ "Detractor",
    nps > 6 & nps <= 8 ~ "Passive",
    TRUE ~ "Promoter"
  )
  
  npc
}


nps <- function(nps) {
  promoters = sum(nps > 8) / length(nps)
  detractors = sum(nps < 7) / length(nps)
  
  promoters - detractors
}

SMALL_FONT_SIZE <- 14
MAIN_LINE <-  1.5
MAIN_POINT <- 5

# The code below is a function I use to format my tables. It relies on the
# gt() package. 
lv_tab_style <- function(gt_object) {
  gt_object |> 
  fmt_number(decimals = 0, use_seps = TRUE) |> 
  tab_style(
    style = list(
      cell_borders(sides = c("top", "bottom"), color = "#cfcfcf")
    ),
    locations = list(cells_body(), cells_stub())
  ) |> 
  tab_style(
    style = list(
      cell_text(color = "#000000")
    ),
    locations = cells_body()
  ) |> 
  tab_style(
    style = list(
      cell_borders(sides = "top", weight = px(3), color = "black"),
      cell_text(weight = "bold", color = "black")
      ),
    locations = list(cells_stub(), cells_column_labels())
  ) |> 
  tab_style(
    style = list(
      cell_borders(sides = "bottom", weight = px(1), color = "black")
    ),
    locations = list(cells_stub(), cells_column_labels())
  ) |> 
  cols_width(
    everything() ~ px(70)
  ) |> 
  tab_options(
    table.align = "left",
    table_body.hlines.color = "#000000", 
    table_body.border.top.style = "#000000", 
    heading.border.bottom.color = "#000000", 
    heading.border.bottom.width = px(1),
    column_labels.border.bottom.color = "#000000",
    column_labels.border.bottom.width = px(1),
    column_labels.font.size = px(12),
    table_body.border.bottom.color = "#000000",
    table_body.border.bottom.width = px(2)
  )

}

# Basic bar chart template
base_bar <- function(dat, x, y, fill, round=0, title="") {
  ggplot(dat, aes(x = {{x}}, y = fct_rev({{y}}), fill = {{y}})) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  geom_text(aes(label = round({{x}}, round)), hjust = 0, fontface="bold") +
  scale_fill_viridis_d() +
  theme(
    axis.text.x = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  labs(
    title = {{title}},
    x="",
    y="Cluster"
  )
}

1. Data Inspection

Numeric Variables

Code

NUMERIC_VARIABLES <- df |>
  select(locations:per_cap, q2_1:q2_5, q4) |> 
  rename(
    `q2_1: Mission` = 'q2_1',
    `q2_2: Coffee` = 'q2_2',
    `q2_3: Store Comfort` = 'q2_3',
    `q2_4: Food Variety` = 'q2_4',
    `q2_5: WiFi` = 'q2_5',
    `q4: NPS` = 'q4'
  ) 

CATEGORICAL_VARIABLES <- df |> 
  select(unlimited:wifi, q1, q3) |>   
  rename(
    `q1: Local` = `q1`,
    `q3: Follow on Social Media` = `q3`
    )

  

NUMERIC_VARIABLES |> 
  gather(Variable, score) |> 
  group_by(Variable) |> 
  summarise(
    Median = median(score), 
    Mean = mean(score), 
    SD = sd(score),  
    Min = min(score), 
    Max = max(score), 
    `NA` = sum(is.na(score))
    ) |> 
  gt() |> 
  lv_tab_style() |> 
  cols_width(Variable ~ px(200)) |> 
  fmt_number(
    columns = Mean:Max,
    decimals = 2
  )

Variable	Median	Mean	SD	Min	Max
food	0	0.41	0.42	0.00	1.00
locations	2	2.74	2.10	1.00	13.00
origination	110	183.66	196.07	1.00	784.00
per_cap	5	4.62	5.43	−13.47	21.18
q2_1: Mission	3	3.08	1.20	1.00	5.00
q2_2: Coffee	3	2.95	1.15	1.00	5.00
q2_3: Store Comfort	3	3.35	1.06	1.00	5.00
q2_4: Food Variety	3	2.93	1.10	1.00	5.00
q2_5: WiFi	3	2.96	1.17	1.00	5.00
q4: NPS	8	8.18	1.41	3.00	10.00
recency	8	8.97	5.30	1.00	28.00
tx	39	89.34	120.27	1.00	691.00

Categorical Variables

Code

CATEGORICAL_VARIABLES |> 
  gather(Variable, score) |> 
  group_by(Variable) |> 
  summarise(pct = mean(score)) |> 
  gt() |> 
  lv_tab_style() |> 
  fmt_percent(
    columns = everything(),
    decimals = 0
  ) |> 
  cols_width(Variable ~ px(300))

Variable	pct
q1: Local	71%
q3: Follow on Social Media	36%
roaster	26%
unlimited	18%
wifi	44%

Right away, I notice that Q2 in the survey data doesn’t seem to be very differentiated. All of the scores gravitate towards a median of 3. Histograms suggest that the data follows a fairly normal distribution.

Code

df |> 
  select(starts_with("q2")) |> 
  gather(question, score) |> 
  ggplot(aes(x=score)) + 
  geom_histogram(binwidth = 1, color = "white", fill = pal[5]) + 
  facet_wrap(~ question)

We can also check to see if there is a statistically significant difference between each of these scores using ANOVA and Tukey's HSD (Honestly Significant Difference). This analysis considers each pair of variables. Grey points are not statistically significant. From a statistical perspective, there is no difference in the mean scores between WiFi and Food Variety, WiFi and Coffee, and Food Variety and Coffee.

Code

base_q2 <- df |> 
  select(starts_with("q2_")) |> 
    rename(
    Mission = 'q2_1',
    Coffee = 'q2_2',
    `Store Comfort` = 'q2_3',
    `Food Variety` = 'q2_4',
    WiFi = 'q2_5'
  ) |> 
  gather(metric, score)

a1 <- aov(score ~ metric, data = base_q2)
hsd_1 <- TukeyHSD(a1)
hsd_tidy <- tidy(hsd_1)


hsd_tidy |> 
  mutate(
    Significance = ifelse(adj.p.value < 0.05, "Significant", "Not Significant"),
    Significance = factor(Significance)
    )  |> 
  ggplot(aes(y=contrast, color = Significance)) +
  geom_segment(aes(xend = conf.low, x = conf.high, yend=contrast), show.legend = FALSE) +
  geom_point(aes(x=estimate), size = 3) +
  geom_text(aes(x=estimate, label = round(estimate, 2)), fontface = "bold", nudge_y = .4, color = "grey60") +
  geom_vline(xintercept = 0, color = pal[2], alpha = 0.8, linetype = "dashed") +
  scale_color_manual(values = c("grey80", "#FF2244")) +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_line(linetype = "dotted", linewidth = .5),
    panel.grid.major.x = element_line(linetype = "dotted", linewidth = .4)
  ) +
  labs(
    title = "Tukey's HSD"
  )

It is also helpful to us to turn our attention to two of the most important drivers of customer-centricity: profitability and satisfaction. We can start with satisfaction, by considering the NPS data. The overall NPS is 34%, but we glean a little more insight by unpacking it’s component parts.

Code

df |> 
  mutate(npc = npc(q4)) |> 
  count(npc) |> 
  ungroup() |> 
  mutate(
    p = prop.table(n)
    ) |> 
  ggplot(aes(x=p, y=factor(1), fill=fct_rev(npc))) +
  geom_bar(stat = "identity", position = position_stack()) +
  geom_text(aes(label = percent(p, 1), color = fct_rev(npc)), size = 5, fontface = "bold", position = position_stack(vjust = 0.5), show.legend = FALSE) +
  scale_fill_manual(values = pal[c(3, 1, 2)]) +
  scale_color_manual(values = c("white", "black", "white")) +
  guides(fill = guide_legend(reverse = TRUE, title = "Category")) +
  theme(
    legend.position = "top",
    legend.justification = "left",
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
  ) +
  labs(
    title = "NPS Categorization",
    x="", y=""
  )

As for profitability, the per_cap variable can help. It measures the average transaction contribution. In our descriptive summary above, we saw that some customers cost us as much as -$13.47 every time they transact. Going back to the whale curve, we can see that about 21% of customers in this data set “tax” our potential earnings.

Code

whale_plot <- function(data, x, y) {
  ggplot(data, aes(x={{x}}, y={{y}})) +
  geom_line(group = 1, linewidth = 1.25, color = "#555351") +
  scale_y_continuous(limits = c(0,1.15), expand = c(0,0), labels = percent_format(accuracy = 1)) +
  scale_x_continuous(limits = c(0,1.05), expand = c(0,0), labels = percent_format(accuracy = 1)) +
  theme(
    axis.line.x.bottom = element_line(color = "#BBB7B4", arrow = grid::arrow(length = unit(0.3, "cm"), ends = "last"), linewidth = 1),
    axis.line.y.left = element_line(color = "#BBB7B4", arrow = grid::arrow(length = unit(0.3, "cm"), ends = "last"), linewidth = 1),
    axis.text.x = element_text(
      face = "bold", 
      margin = margin(10, 0, 0, 0, unit = "pt")
      ),
    axis.text.y = element_text(
      face = "bold", 
      margin = margin(0, 10, 0, 0, unit = "pt")
      ),
    axis.title.y = element_text(size = 12),
    axis.title.x = element_text(size = 12),
    panel.grid = element_blank(),
    plot.title = element_text(margin = margin(0,0,3,0, "pt"), size = 28),
    plot.subtitle = element_text(size = 18, margin = margin(0, 0, 36, 0, unit = "pt"))
  )  
}

total_profit <- sum(df$per_cap)
total_customers <- nrow(df)

whaler <- df %>% 
  arrange(desc(per_cap)) %>% 
  mutate(
    cum_profit = cumsum(per_cap),
    pct_profit = cum_profit / total_profit,
    rank = row_number(),
    pct_customers = row_number() / total_customers
  )

max_profit_pos = which(whaler$cum_profit == max(whaler$cum_profit)) / max(whaler$rank)
taxers <- 1 - max_profit_pos

plot_whaler <- whale_plot(whaler, x=pct_customers, y=pct_profit) +
  geom_hline(yintercept = 1, color = "#C1272D", linewidth = 0.8, linetype = "dotted") +
  annotate("text", x=0.05, y=0.07, label = dollar(max(whaler$per_cap), 1), hjust = 0, fontface = "bold", size = 6) +
  annotate("text", x=0.89, y=0.91, label = dollar(whaler$cum_profit[nrow(whaler)], 1), hjust = 0, fontface = "bold", size = 6) +
  labs(
    title = "Whale Curve",
    subtitle = str_glue("{percent(taxers, 1)} of customers tax profit"),
    x="Percent of Customers (Ranked)", y="Percent of Profitability"
  )

plot_whaler

2. Preparing the data for cluster analysis

Our goal is to look for statistically significant segments or clusters in the data. We have a mixed dataset of numeric and categorical variables. In our numeric set of variables we also have some high contrasts (variables with very different scales). For example, origination has a maximum value of 784 and a minimum value of 1, while locations has a range of 1-13. These high variances can influence k-means, so it is wise to pre-process the data. One approach is to simply scale all of the numeric variables. While this approach is valid, plotting your results can be challenging with so many variables. For this reason, another approach is to pre-process the data using PCA. This has two advantages. First, it provides you with a new set of uncorrelated variables, but the second advantage is that it can often allow you to reduce the dimensionality of your dataset to just two principal components. These are easier to plot and visualize.

Here’s what I mean by visualizing with PCA:

Code

pca_data <- df |>
  select(-id) |>
  prcomp(scale. = TRUE)

pct_exp <- pca_data |>
  tidy(matrix = "eigenvalues") |>
  pluck("cumulative", 2)

pca_tidy <- pca_data |>
  tidy(matrix = "rotation")

pca_augment <- pca_data |>
  augment(df)

# Function to plot PCA
pca_plot <- function(data, title, pce) {
  ggplot(data, aes(x=PC1, y=PC2)) +
  geom_segment(
    xend = 0,
    yend = 0,
    arrow = arrow(ends = "first", length = unit(6, "pt"))
  ) +
  geom_label_repel(aes(label = column)) +
  labs(
    title = title,
    caption = str_glue("{percent(pce, 1)} of data explained")
  )
}

pca_tidy |>
  pivot_wider(
    names_from = "PC",
    names_prefix = "PC",
    values_from = "value"
  ) |>
  pca_plot(title = "PCA of All Data", pct_exp)

In the plot above, we obtain a high-level overview of our data. The arrows indicate the direction the data moves relative to a central point. This allows us to analyze which data are most correlated. For example, q3 (Follow on Social) seems to be inversely correlated with locations. To make this plot, I used only the first and second principal components (PC1 and PC2). If I used more components, I couldn’t have made the plot. The trade-off is precision. You’ll note that these two principal components explain only 33% of the data.

The biggest issue with the plot above isn’t the lack of precision, but the fact that I used PCA on a dataset mixed with numeric and categorical variables. Categorical variables don’t really satisfy the conditions necessary for PCA. The unlimited variables is binary. A customer either is a member or they are not. There is no point between zero and one in reality. This nuance can lead us astray when we input our pre-processed data into k-means for clustering.

The solution I’ve chosen is an algorithmic technique known as Factor Analysis for Mixed Data. This algorithm runs the numeric variables through PCA and the categorical variables through Multiple Correspondence Analysis (MCA), the categorical equivalent of PCA. It then takes the resulting numeric output and creates new factors (similar to components) that can be used for clustering and other analysis. The resulting plots are not quite as satisfying, but they provide several views of the data, the last looking very similar to PCA.

Code

k4_fct <- df |> 
  select(-id) |> 
  mutate(
    unlimited = factor(unlimited, levels = 1:0, labels = c("Unlimited", "Not Unlimited")),
    roaster = factor(roaster, levels = 1:0, labels = c("Roaster", "Not Roaster")),
    wifi = factor(wifi, levels = 1:0, labels = c("WiFi", "Not WiFi")),
    local = factor(q1, levels = 1:0, labels = c("Local", "Not Local")),
    social = factor(q3, levels = 1:0, labels = c("Social Promoter", "Not Social Promoter"))
    ) |> 
  select(-q1, -q3)

k4_fct_scaled <- k4_fct |> 
  mutate(across(c(locations:per_cap, q2_1:q2_5, q4), \(x) as.numeric(scale(x))))
  

# 
# fct_dim <- famd$var$coord[, 1:2]
# fct_names <- rownames(famd$var$coord)
# 
# fct_dims <- fct_dim |> 
#   as_tibble() |> 
#   bind_cols(column = fct_names) |> 
#   relocate(column, .before = Dim.1) |> 
#   rename(PC1 = Dim.1, PC2 = Dim.2)
# 
# fct_dims |> 
#   pca_plot(title = "FAMD of All Data", pce = pct_exp/100)

famd <- FAMD(k4_fct_scaled, graph = TRUE)

Don’t worry if you don’t follow all of the graphs above. They are just different views of the data after running the factoring algorithm. You might note the PCA plot at the bottom. It is only plotting the numeric variables. We can see that the directions are very similar to the PCA plot I previously constructed with the binary variables treated as numeric, the only difference is that the categorical variables have been removed.

3. Clustering with K-Means

I chose to begin my analysis with k-means. I am going to feed the k-means function in my stats software with the first two factor scores that resulted from my famd (Factor Analysis of Mixed Data). This will cost us some precision, but will make analysis easier.

The scree plot below suggests an elbow of three clusters, but I’m interested in corroborating (or not) the qualitative research findings. A four-cluster solution seems viable.

Code

factored_df <- famd$ind$coord[, 1:2] |>
  as_tibble() |>
  janitor::clean_names()

factored_df_aug <- df |>
  bind_cols(factored_df)

set.seed(2121)
kclusts <- tibble(k = 1:8) |> 
  mutate(
    clust = map(k, ~ kmeans(factored_df, centers = .x, nstart = 25)),
    tidied = map(clust, tidy),
    glanced = map(clust, glance),
    augmented = map(clust, augment, factored_df_aug)
  )

pct_exp <- famd$eig |> 
  as_tibble() |> 
  janitor::clean_names() |> 
  pluck("cumulative_percentage_of_variance", 2)

kclusts |> 
  unnest(glanced) |> 
  ggplot(aes(x=k, y=tot.withinss)) +
  geom_line(aes(group=1)) +
  geom_point(size = 4) +
  labs(
    title = "Scree Plot",
    subtitle = "Using first two factor dimensions",
    caption = str_glue("{percent(pct_exp / 100, 1)} of data explained")
  )

The argument in favor of a three-cluster solution grows stronger when we map the cluster centers and tag the data points assigned to them. However, a four-cluster solution is not unwieldy. For discussion purposes, I’m going to stay with a four-cluster solution, but you may have chosen three clusters. It will be interesting to compare and contrast our results.

Code

kclusts |> 
  unnest(augmented) |> 
  ggplot(aes(x = dim_1, y = dim_2)) +
  geom_point(aes(color = .cluster), alpha = 0.15, size = 1, show.legend = FALSE) +
  facet_wrap(~ k, nrow = 2) +
  scale_color_viridis_d() +
  geom_point(data = kclusts |> unnest(tidied), size = 5, shape = 13, color = "#FFFFFF", show.legend = FALSE) +
  labs(
    title = "Cluster Centers",
    caption = str_glue("{percent(pct_exp / 100, 1)} of data explained")
  )

Customer performance metrics

Once all the data has been assigned to a cluster, I find it helpful to consider the big picture. In this case, how does profitability and satisfaction differ by cluster? The graph below suggests that K4 is the most satisfied but the least profitable. K1 is the most profitable but the least satisfied.

Code

k4 <- kclusts |> 
  unnest(augmented) |> 
  filter(k == 4)

K4_SCALE <- k4 |> 
  group_by(.cluster) |> 
  summarise(n = n(), .groups = "drop") |> 
  mutate(p = prop.table(n))

plot_core <- k4 |> 
  group_by(.cluster) |> 
  summarise(
    N = n(),
    Profit = mean(per_cap), 
    NPS = nps(q4)) |> 
  gather(metric, score, -.cluster, -N) |> 
  mutate(
    label = str_glue("K{.cluster}\n(n={N})")
  ) |> 
  ggplot(aes(x=score, y=fct_rev(label), fill=label)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  geom_text(aes(label = round(score,2)), hjust=0, nudge_x = 0.02, fontface="bold") +
  scale_fill_viridis_d() +
  theme(
    axis.text.x = element_blank(),
    strip.text = element_text(face = "bold"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  facet_wrap(~ metric, scales = "free_x") +
  labs(x="", y="Cluster")

plot_core +
  scale_x_continuous(expand = expansion(mult = c(0, 0.5)))

It’s also a good practice to consider scale and how substantial each of these segments is. Our least profitable segment (according to this segmentation) is also our biggest share of customers at 32%.

Code

k4 |> 
  group_by(cluster = .cluster) |> 
  count() |> 
  ungroup() |> 
  mutate(
    p = prop.table(n),
    label = str_glue("K{cluster}\n{percent(p, 1)}")
    ) |> 
  ggplot(aes(area = n, fill=n, label=label)) +
  treemapify::geom_treemap(show.legend = FALSE) +
  treemapify::geom_treemap_text(color = "white") +
  theme(
    plot.margin = margin(0, 0, 0, 0, "pt")
  )

Behavioral characteristics

Code

k4 |> 
  select(.cluster, nps = q4, locations:per_cap) |> 
  mutate(across(where(is.numeric), \(x) as.numeric(scale(x)))) |> 
  gather(variable, score, -.cluster) |> 
  group_by(.cluster, variable) |> 
  summarise(mu = mean(score)) |> 
  ggplot(aes(x=variable, y=mu, color = .cluster, shape=.cluster, group = .cluster)) +
  geom_line(linewidth=1.2, show.legend = FALSE) +
  geom_point(size = 3) +
  scale_color_viridis_d() +
  labs(
    title = "Behavioral Differences",
    subtitle = "Scaled Data by Cluster",
    x="", y=""
  )

K4–High Satisfaction; Low Profitability (Free Riders)

Let’s take a closer look at K4, since it is both substantial and economically challenging, relative to other customer segments. As we discussed in class, this is the classic “Free Rider” case, where we’re delivering a lot of value for this segment but they are not reciprocating. This segment transacted recently, was originated recently, had a low transaction value, and has visited very few locations (often only one). This seems to fit the profile of the Reese segment in Brad Dahlstrom’s qualitative study; it fits the profile of our tourist segment.

Code

k4 |> 
  select(.cluster, nps = q4, locations:per_cap) |> 
  mutate(
    across(where(is.numeric), \(x) as.numeric(scale(x)))
    ) |> 
  gather(variable, score, -.cluster) |> 
  group_by(.cluster, variable) |> 
  summarise(mu = mean(score), .groups = "drop") |> 
  ggplot(aes(x=variable, y=mu, color = .cluster, shape=.cluster, group = .cluster)) +
  geom_line(linewidth=MAIN_LINE, show.legend = FALSE) +
  geom_point(size = MAIN_POINT) +
  scale_color_manual(values = c(rep("grey80", 3), pal[2])) +
  labs(
    title = "Behavioral Differences",
    subtitle = "Scaled Data by Cluster",
    x="", y=""
  )

The above plot is mean scores of scaled data. We might glean more insight by looking at the raw data. In these terms, we can see that this segment indeed reflects a tourist visitation pattern. The average customer in this segment originated around 15 days ago and their last visit was also about 15 days ago.

Code

k4 |> 
  select(recency, origination, cluster = .cluster) |> 
  mutate(cluster = factor(cluster)) |> 
  gather(metric, score, -cluster) |> 
  group_by(cluster, metric) |> 
  summarise(mu = mean(score), .groups = "drop") |> 
  ggplot(aes(x=mu, y=fct_rev(cluster), fill=cluster)) +
  geom_bar(stat = "identity", position="stack", show.legend = FALSE) +
  geom_text(aes(label = round(mu, 0)), hjust=0, fontface="bold") +
  scale_fill_manual(values = c(rep("grey80", 3), pal[2])) +
  scale_x_continuous(expand = expansion(mult = c(0, 0.2))) +
  theme(
    axis.text.x = element_blank()
  ) +
  facet_wrap(~ metric, scales = "free_x") +
  labs(
    x="",
    y="Cluster"
  )

We can also look at the maximum number of locations visited vs the median number. While the maximum is not the lowest value, the median is. Most of these customers visited only one location. Again, this supports a hypothesis that this segment is the tourist (“Reese”) segment.

Code

k4 |> 
  mutate(cluster = factor(.cluster)) |> 
  group_by(cluster) |> 
  summarise('Median Locations' = median(locations), 'Maximum Locations' = max(locations), .groups = "drop") |> 
  gather(metric, score, -cluster) |> 
  ggplot(aes(x=score, y=fct_rev(cluster), fill=cluster)) + 
  geom_bar(stat = "identity", show.legend = FALSE) +
  geom_text(aes(label = score), hjust = 0, fontface = "bold", nudge_x = 0.1) +
  scale_fill_manual(values = c(rep("grey80", 3), pal[2])) +
  theme(
    axis.text.x = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  facet_wrap(~metric, scales = "free_x") +
  xlim(0,15) +
  labs(
    x="",
    y="Cluster"
  )

What about the categorical variables in our behavioral data (unlimited membership, roaster’s membership, and WiFi registration)? Here again, the data corroborates the hypothesis. This segment has the lowest membership in our loyalty programs and low registration for WiFi service.

Code

k4 |> 
  select(cluster = .cluster, unlimited:wifi) |> 
  mutate(cluster = factor(cluster)) |> 
  gather(metric, score, -cluster) |> 
  group_by(cluster, metric) |> 
  summarise(mu = sum(score == "1")/n(), .groups = "drop") |> 
  ggplot(aes(x=mu, y=fct_rev(cluster), fill=cluster)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  geom_text(aes(label = percent(mu, 1)), hjust = 0, fontface = "bold") +
  scale_fill_manual(values = c("grey80", "grey80", "grey80", pal[2])) +
  scale_x_continuous(expand = expansion(mult = c(0, 0.2))) +
  theme(
    axis.text.x = element_blank()
  ) +
  facet_wrap(~ metric, nrow = 2) +
  labs(
    title = "Categorical Behavior",
    x="", y="Cluster"
  )

Finally, let’s explore the survey data for K4. Across the board, this segment has low to average scores. Nothing seems that important.

Code

k4 |> 
  select(.cluster, starts_with("q2_")) |> 
  rename(
    Mission = 'q2_1',
    Coffee = 'q2_2',
    Comfort = 'q2_3',
    Food = 'q2_4',
    WiFi = 'q2_5'
  ) |> 
  mutate(
    across(where(is.numeric), \(x) as.numeric(scale(x)))
  ) |> 
  gather(variable, score, -.cluster) |> 
  group_by(.cluster, variable) |> 
  summarise(mu = mean(score), .groups = "drop") |> 
  ggplot(aes(x=variable, y=mu, color = .cluster, shape=.cluster, group = .cluster)) +
  geom_line(linewidth=MAIN_LINE, show.legend = FALSE) +
  geom_point(size = MAIN_POINT) +
  scale_color_manual(values = c(rep("grey80", 3), pal[2])) +
  labs(
    title = "Survey Differences--Importance Attributes",
    subtitle = "Scaled Data by Cluster",
    x="", y=""
  )

Despite the fact that less than a quarter of this segment claims they are local, almost half follow us on social media, which suggests this is our tourist segment.

Code

plot_local <- k4 |> 
  group_by(.cluster) |> 
  summarise(mu = sum(q1 == "1")/n()) |> 
  base_bar(x=mu, y=.cluster, fill = .cluster, round = 2, title = "Q1: Local") +
  scale_fill_manual(values = c(rep("grey80", 3), pal[2])) +
  xlim(0,1.1)

plot_social <- k4 |> 
  group_by(.cluster) |> 
  summarise(mu = sum(q3 == "1")/n()) |> 
  base_bar(x = mu, y=.cluster, fill=.cluster, round = 2, title = "Q3: Social Follow") +
  scale_fill_manual(values = c(rep("grey80", 3), pal[2])) +
  xlim(0,1.1)


plot_local + plot_social

K1–High Profitability; Low Satisfaction (Vulnerable Customers)

Now, let’s look at the reverse: our customers who make the most profit but have the lowest satisfaction. The data here seems less clear. We would expect Brooke to have the highest number of locations, but we wouldn’t expect her to over-index Grady on Roaster’s Club.

Code

k4 |> 
  select(.cluster, nps = q4, locations:per_cap) |> 
  mutate(
    across(where(is.numeric), \(x) as.numeric(scale(x)))
    ) |> 
  gather(variable, score, -.cluster) |> 
  group_by(.cluster, variable) |> 
  summarise(mu = mean(score), .groups = "drop") |> 
  mutate(color = ifelse(.cluster == "1", pal[2], "grey70")) |> 
  ggplot(aes(x=variable, y=mu, color = color, shape=.cluster, group = .cluster)) +
  geom_line(linewidth=MAIN_LINE) +
  geom_point(size = 3) +
  scale_color_identity() +
  labs(
    title = "Behavioral Differences",
    subtitle = "Scaled Data by Cluster",
    x="", y=""
  )

Code

k4 |> 
  select(cluster = .cluster, unlimited:wifi) |> 
  mutate(cluster = factor(cluster)) |> 
  gather(metric, score, -cluster) |> 
  group_by(cluster, metric) |> 
  summarise(mu = sum(score == "1")/n(), .groups = "drop") |> 
  ggplot(aes(x=mu, y=fct_rev(cluster), fill=cluster)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  geom_text(aes(label = percent(mu, 1)), hjust = 0, fontface = "bold") +
  scale_fill_manual(values = c(pal[2], "grey80", "grey80", "grey80")) +
  scale_x_continuous(expand = expansion(mult = c(0, 0.2))) +
  theme(
    axis.text.x = element_blank()
  ) +
  facet_wrap(~ metric, nrow = 2) +
  labs(
    title = "Categorical Behavior",
    x="", y="Cluster"
  )

The survey data suggests that this might be Grady’s cohort. This cluster had the highest scores agreeing statement that “The quality of Hecuba’s coffee” is important.

Code

k4 |> 
  select(.cluster, starts_with("q2_")) |> 
  rename(
    Mission = 'q2_1',
    Coffee = 'q2_2',
    Comfort = 'q2_3',
    Food = 'q2_4',
    WiFi = 'q2_5'
  ) |> 
  mutate(
    across(where(is.numeric), \(x) as.numeric(scale(x)))
  ) |> 
  gather(variable, score, -.cluster) |> 
  group_by(.cluster, variable) |> 
  summarise(mu = mean(score), .groups = "drop") |> 
  ggplot(aes(x=variable, y=mu, color = .cluster, shape=.cluster, group = .cluster)) +
  geom_line(linewidth=MAIN_LINE, show.legend = FALSE) +
  geom_point(size = 3) +
  scale_color_manual(values = c(pal[2], rep("grey80", 3))) +
  labs(
    title = "Survey Differences--Importance Attributes",
    subtitle = "Scaled Data by Cluster",
    x="", y=""
  )

The segment is almost entirely local, but it doesn’t follow on social media. It’s sounding more and more like Grady’s cohort.

Code

plot_local <- k4 |> 
  group_by(.cluster) |> 
  summarise(mu = sum(q1 == "1")/n()) |> 
  base_bar(x=mu, y=.cluster, fill = .cluster, round = 2, title = "Q1: Local") +
  scale_fill_manual(values = c(pal[2], rep("grey80", 3))) +
  xlim(0,1.1)

plot_social <- k4 |> 
  group_by(.cluster) |> 
  summarise(mu = sum(q3 == "1")/n()) |> 
  base_bar(x = mu, y=.cluster, fill=.cluster, round = 2, title = "Q3: Social Follow") +
  scale_fill_manual(values = c(pal[2], rep("grey80", 3))) +
  xlim(0,1)


plot_local + plot_social

K2 & K3–Steady Customers

Now, let’s consider K2 and K3. Each of these segments has decent satisfaction and profitability, with K3 slightly under-indexing on the latter dimension. The high origination for K3 suggests the segment identified as Michelle, though it is puzzling that this segment under-indexes on recency. High origination for K2 suggests Michelle.

Code

k4 |> 
  select(.cluster, nps = q4, locations:per_cap) |> 
  mutate(
    across(where(is.numeric), \(x) as.numeric(scale(x)))
    ) |> 
  gather(variable, score, -.cluster) |> 
  group_by(.cluster, variable) |> 
  summarise(mu = mean(score), .groups = "drop") |> 
  ggplot(aes(x=variable, y=mu, color = .cluster, shape=.cluster, group = .cluster)) +
  geom_line(linewidth=MAIN_LINE, show.legend = FALSE) +
  geom_point(size = 3) +
  scale_color_manual(values = c("grey70", pal[2:3], "grey70")) +
  labs(
    title = "Behavioral Differences",
    subtitle = "Scaled Data by Cluster",
    x="", y=""
  )

Categorically, segment 2 is more likely to be enrolled in unlimited, which tracks with qualitative research on the Michelle segment–our loyalists. wifi is higher with segment 3, which tracks with Brooke.

Code

k4 |> 
  select(cluster = .cluster, unlimited:wifi) |> 
  mutate(cluster = factor(cluster)) |> 
  gather(metric, score, -cluster) |> 
  group_by(cluster, metric) |> 
  summarise(mu = sum(score == "1")/n(), .groups = "drop") |> 
  ggplot(aes(x=mu, y=fct_rev(cluster), fill=cluster)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  geom_text(aes(label = percent(mu, 1)), hjust = 0, fontface = "bold") +
  scale_fill_manual(values = c("grey80", pal[2:3], "grey80")) +
  scale_x_continuous(expand = expansion(mult = c(0, 0.2))) +
  theme(
    axis.text.x = element_blank()
  ) +
  facet_wrap(~ metric, nrow = 2) +
  labs(
    title = "Categorical Behavior",
    x="", y="Cluster"
  )

The location data also corroborates what we might expect from our Brooke segment. They max out at 10 locations vs 5 for segment 2.

Code

k4 |> 
  mutate(cluster = factor(.cluster)) |> 
  group_by(cluster) |> 
  summarise('Median Locations' = median(locations), 'Maximum Locations' = max(locations), .groups = "drop") |> 
  gather(metric, score, -cluster) |> 
  ggplot(aes(x=score, y=fct_rev(cluster), fill=cluster)) + 
  geom_bar(stat = "identity", show.legend = FALSE) +
  geom_text(aes(label = score), hjust = 0, fontface = "bold", nudge_x = 0.1) +
  scale_fill_manual(values = c("grey80", pal[2:3], "grey80")) +
  theme(
    axis.text.x = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  facet_wrap(~metric, scales = "free_x") +
  xlim(0,15) +
  labs(
    x="",
    y="Cluster"
  )

The survey data would support an argument of segment 2 being aligned with Michelle, as the mission is the most important aspect, whereas segment 3 seems to have few high inflection points.

Code

k4 |> 
  select(.cluster, starts_with("q2_")) |> 
  rename(
    Mission = 'q2_1',
    Coffee = 'q2_2',
    Comfort = 'q2_3',
    Food = 'q2_4',
    WiFi = 'q2_5'
  ) |> 
  mutate(
    across(where(is.numeric), \(x) as.numeric(scale(x)))
  ) |> 
  gather(variable, score, -.cluster) |> 
  group_by(.cluster, variable) |> 
  summarise(mu = mean(score), .groups = "drop") |> 
  ggplot(aes(x=variable, y=mu, color = .cluster, shape=.cluster, group = .cluster)) +
  geom_line(linewidth=MAIN_LINE, show.legend = FALSE) +
  geom_point(size = 3) +
  scale_color_manual(values = c("grey80", pal[2:3], "grey80")) +
  labs(
    title = "Survey Differences--Importance Attributes",
    subtitle = "Scaled Data by Cluster",
    x="", y=""
  )

Finally, the high local and moderate social media activity are aligned with Michelle and Brooke’s qualitative profiles.

Code

plot_local <- k4 |> 
  group_by(.cluster) |> 
  summarise(mu = sum(q1 == "1")/n()) |> 
  base_bar(x=mu, y=.cluster, fill = .cluster, round = 2, title = "Q1: Local") +
  scale_fill_manual(values = c("grey80", pal[2:3], "grey80")) +
  xlim(0,1.1)

plot_social <- k4 |> 
  group_by(.cluster) |> 
  summarise(mu = sum(q3 == "1")/n()) |> 
  base_bar(x = mu, y=.cluster, fill=.cluster, round = 2, title = "Q3: Social Follow") +
  scale_fill_manual(values = c("grey80", pal[2:3], "grey80")) +
  xlim(0,1)


plot_local + plot_social

4. Other Considerations

We looked at the aggregate NPS scores, but what about the categories?

Code

nps_scores <- k4 |> 
  group_by(cluster = .cluster) |> 
  summarise(p = nps(q4), .groups = "drop") 



k4 |>
  mutate(
    npc = npc(q4)
    ) |>
  group_by(cluster = .cluster, npc) |>
  count() |>
  ungroup() |>
  group_by(cluster) |>
  mutate(
    p = prop.table(n),
    label_color = ifelse(npc == "Detractor", "black", "white")) |>
  ggplot(aes(x=p, y=fct_rev(cluster), fill=fct_rev(npc))) +
  geom_bar(stat = "identity", position = position_stack()) +
  geom_text(aes(label = percent(p, 1), color = fct_rev(npc)), position = position_stack(vjust = 0.5), fontface = "bold", show.legend = FALSE) +
  scale_fill_viridis_d() +
  scale_color_manual(values = c("white", "white", "black")) +
  guides(fill = guide_legend(reverse = TRUE, title = "NPS Category")) +
  theme(
    legend.position = "top",
    legend.justification = "left",
    axis.text.x = element_blank()
  ) +
  labs(
    x="", y="Cluster"
  )

Also, can we trace the problematic whale curve customers to clusters?

Code

k4 |> 
  filter(per_cap < 0) |> 
  count(.cluster) |> 
  ggplot(aes(x=n, y=fct_rev(.cluster), fill=.cluster)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  geom_text(aes(label = number(n)), color = "white", fontface="bold", hjust=1) +
  scale_fill_manual(values = c(rep("grey60", 3), pal[2])) +
  theme(
    axis.text.x = element_blank()
  ) +
  labs(
    x="", y="Cluster",
    title = "Unprofitable customers by cluster"
  )