To explore the functions in the TAGAM package, we will be using a dataset containing SMS messages. The dataset will be downloaded from the UCI Machine Learning Repository, and more information can be found at http://www.dt.fee.unicamp.br/~tiago/smsspamcollection/. Using the SMS message data, we will…

  • convert the narratives to numeric vectors
  • fit a three-stage generalized additive model (GAM)
  • predict results for a test set
library(TAGAM)
library(tidyverse)
library(tidytext)

Data

As previously mentioned, we will download the data from the UCI Machine Learning Repository.

temp <- tempfile()
download.file("https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip", temp)
spam <- read.delim(unz(temp, "SMSSpamCollection"),
                   header = FALSE,
                   quote = "",
                   stringsAsFactors = FALSE)
names(spam) <- c("spam", "text")

# Use the first 1674 messages for training and the remainder for testing
samp <- 1:1674
training <- spam[samp, ]
y_training <- spam$spam[samp]

testing <- spam[-samp, ]
y_testing <- spam$spam[-samp]
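
Before moving on, it is worth checking the class balance of the split; the first column of the raw data holds the ham/spam label.

table(y_training)
table(y_testing)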

Text Conversion

To convert the text of the SMS messages to numeric vectors, we must download and format the word embeddings. Here, we choose to use the 300-dimensional GloVe word embeddings. The formatWordEmbeddings function takes the raw word embedding matrix and converts it into a named list, where each entry is a numeric vector of length 300. For example, we would use word_embeddings[["the"]] to extract the numeric vector for the word the.

# download.file("http://nlp.stanford.edu/data/wordvecs/glove.6B.zip", "glove.6B.zip")
# unzip("glove.6B.zip")
embedding_matrix <- read_table2("glove.6B.300d.txt", col_names = FALSE)
word_embeddings <- formatWordEmbeddings(embedding_matrix, normalize = TRUE)
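
Assuming the download and formatting steps above ran without error, we can confirm that word_embeddings behaves as described, with each entry being a numeric vector of length 300.

length(word_embeddings[["the"]])
is.numeric(word_embeddings[["the"]])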

cs.matrix is used to construct the cosine similarity matrix from the word embeddings. In order to use cs.matrix, the messages need to be reformatted into a tibble with columns line and word. This can be done using the tidytext package:

tidy_training <- tibble(line = 1:nrow(training), text = training$text) %>%
  unnest_tokens(word, text) %>%
  filter(word %in% names(word_embeddings)) %>%
  filter(!word %in% stop_words$word)

tidy_testing <- tibble(line = 1:nrow(testing), text = testing$text) %>%
  unnest_tokens(word, text) %>%
  filter(word %in% names(word_embeddings)) %>%
  filter(!word %in% stop_words$word)
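
The resulting tibbles contain one row per (message, token) pair, with the line and word columns that cs.matrix expects; a quick look confirms the layout.

head(tidy_training)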

In addition, we would like to remove words that provide very little meaning to the context. The filter() steps above therefore keep only words that appear in word_embeddings and drop all stop words (using the stop_words lexicon from tidytext).

Now we need to choose which words will represent our covariates. The simplest choice would be to use every word in tidy_training; here we restrict to words that appear at least five times in the training messages. The cosine similarity matrices are then constructed.

words <- table(tidy_training$word)
words <- names(words)[words >= 5]

cs.training <- cs.matrix(tidy_training, words = words, word_embeddings,
                         epsilon = 0.2, parallel = TRUE, n.cluster = 8)
cs.testing <- cs.matrix(tidy_testing, words = words, word_embeddings,
                        epsilon = 0.2, parallel = TRUE, n.cluster = 8)
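
Assuming cs.matrix returns one row per message and one column per word in words (which is how the matrices are used below), their dimensions can be checked with:

dim(cs.training)
dim(cs.testing)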

In addition to the cosine similarity features, we construct a few simple summary covariates for each message: the proportion of tokens not found in the word embeddings, counts of common punctuation characters, and the proportion of letters that are upper case.

# Proportion of each message's tokens that do not appear in word_embeddings
tidy_training_summary_vars <- tibble(line = 1:nrow(training), text = training$text) %>%
  unnest_tokens(word, text) %>%
  mutate(not_word_in_WE = !word %in% names(word_embeddings)) %>%
  group_by(line) %>%
  summarise(num_word_not_in_WE = mean(not_word_in_WE))

# Punctuation counts per message (strip_punct = FALSE keeps punctuation as tokens)
tidy_training_punc_vars <- tibble(line = 1:nrow(training), text = training$text) %>%
  unnest_tokens(word, text, strip_punct = FALSE) %>%
  group_by(line) %>%
  summarise(num_exclamation = sum(word == "!"),
            num_period = sum(word == "."),
            num_comma = sum(word == ","),
            num_ampersand = sum(word == "&"),
            num_forwardslash = sum(word == "/"))

# Proportion of letters in each message that are upper case
num_caps <- sapply(regmatches(training$text, gregexpr("[A-Z]", training$text, perl = TRUE)), length)
training_prop_caps <- num_caps/(num_caps + sapply(regmatches(training$text, gregexpr("[a-z]", training$text, perl = TRUE)), length))
training_prop_caps <- ifelse(is.na(training_prop_caps), 0, training_prop_caps)
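
As an optional sanity check, we can verify that each engineered covariate has exactly one value per training message; a message consisting only of punctuation, for example, would produce no tokens and drop out of the summaries.

stopifnot(nrow(tidy_training_summary_vars) == nrow(training),
          nrow(tidy_training_punc_vars) == nrow(training),
          length(training_prop_caps) == nrow(training))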


# Same summary covariates for the test set
tidy_testing_summary_vars <- tibble(line = 1:nrow(testing), text = testing$text) %>%
  unnest_tokens(word, text) %>%
  mutate(not_word_in_WE = !word %in% names(word_embeddings)) %>%
  group_by(line) %>%
  summarise(num_word_not_in_WE = mean(not_word_in_WE))
# Test messages that yield no tokens drop out of the summary above; add them back with a value of 0
tidy_testing_summary_vars <- left_join(data.frame("line" = 1:nrow(testing)), tidy_testing_summary_vars, by = "line")
tidy_testing_summary_vars$num_word_not_in_WE[is.na(tidy_testing_summary_vars$num_word_not_in_WE)] <- 0
tidy_testing_punc_vars <- tibble(line = 1:nrow(testing), text = testing$text) %>%
  unnest_tokens(word, text, strip_punct = FALSE) %>%
  group_by(line) %>%
  summarise(num_exclamation = sum(word == "!"),
            num_period = sum(word == "."),
            num_comma = sum(word == ","),
            num_ampersand = sum(word == "&"),
            num_forwardslash = sum(word == "/"))
num_caps <- sapply(regmatches(testing$text, gregexpr("[A-Z]", testing$text, perl = TRUE)), length)
testing_prop_caps <- num_caps/(num_caps + sapply(regmatches(testing$text, gregexpr("[a-z]", testing$text, perl = TRUE)), length))
testing_prop_caps <- ifelse(is.na(testing_prop_caps), 0, testing_prop_caps)

Model Fitting

We can use the gam3 function to fit a generalized additive model via a three-stage approach. gam3 takes arguments similar to gam/bam from the mgcv package, in addition to several arguments from cv.gglasso from the gglasso package.

# Combine the binary response with the summary, punctuation, capitalization, and cosine similarity covariates
training_data <- data.frame((y_training == "spam") + 0, tidy_training_summary_vars[, -1],
                            tidy_training_punc_vars[, -1], training_prop_caps, as.matrix(cs.training))
names(training_data) <- c("y", paste("X", 1:(ncol(training_data)-1), sep = ""))

m <- c(2, 1)
k <- 4

# One P-spline smooth term per covariate
terms <- paste("s(", names(training_data)[-1], ", bs = 'ps', ", "m = ", m, ", k = ", k, ")", sep = "")
formula <- as.formula(paste("y ~", paste(terms, collapse = " + ")))
lambda.factor <- 0.0001

set.seed(2020)
gam3.mod <- gam3(formula, training_data, gam.function = "bam", lambda.factor = lambda.factor, discrete = TRUE, family = binomial())
#> Starting stage 1
#> Starting stage 2
#> Starting stage 3
#> Warning in bgam.fitd(G, mf, gp, scale, nobs.extra = 0, rho = rho, coef = coef, :
#> fitted probabilities numerically 0 or 1 occurred
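
The exact structure of the fitted object depends on the gam3 implementation; a generic way to inspect its top-level components is:

str(gam3.mod, max.level = 1)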

# Assemble the test covariates in the same order as the training data
testing_data <- data.frame((y_testing == "spam") + 0, tidy_testing_summary_vars[, -1],
                           tidy_testing_punc_vars[, -1], testing_prop_caps, as.matrix(cs.testing))
names(testing_data) <- c("y", paste("X", 1:(ncol(testing_data)-1), sep = ""))


# Classify a message as spam (1) when the predicted probability is at least 0.5
prediction <- ifelse(predict(gam3.mod, newdata = testing_data, type = "response") < 0.5, 0, 1)
results <- table("true" = testing_data$y, prediction)
results
#>     prediction
#> true    0    1
#>    0 3353   38
#>    1   62  447

# Counts from the confusion matrix (rows: truth, columns: prediction)
TP <- results[2, 2]
TN <- results[1, 1]
FP <- results[1, 2]
FN <- results[2, 1]

# Accuracy
(TP+TN)/(TP+TN+FP+FN)
#> [1] 0.974359

# SC: spam caught rate (sensitivity)
TP/(TP+FN)
#> [1] 0.8781925

# BH: blocked ham rate (false positive rate)
FP/(TN+FP)
#> [1] 0.01120613

# MCC: Matthews correlation coefficient
(TP*TN - FP*FN)/(sqrt(TP+FP)*sqrt(TP+FN)*sqrt(TN+FP)*sqrt(TN+FN))
#> [1] 0.8850521
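
For convenience, the four metrics can be wrapped into a small helper. This is not part of TAGAM; it is just a sketch that assumes the confusion matrix is laid out as above, with ham in the first row/column and spam in the second.

classification_metrics <- function(results) {
  # Extract counts, treating spam as the positive class
  TP <- results[2, 2]; TN <- results[1, 1]
  FP <- results[1, 2]; FN <- results[2, 1]
  c(accuracy = (TP + TN)/(TP + TN + FP + FN),
    SC = TP/(TP + FN),    # spam caught rate
    BH = FP/(TN + FP),    # blocked ham rate
    MCC = (TP*TN - FP*FN)/(sqrt(TP + FP)*sqrt(TP + FN)*sqrt(TN + FP)*sqrt(TN + FN)))
}
classification_metrics(results)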