We performed additional validation of our analytical approach and findings. First, we performed an analysis of interobserver reliability in call classification at the individual scale across multiple observers. Second, we compared visual inspection, spectrographic cross-correlation (SPCC) and random forests as similarity methods. Finally, we asked whether or not a difference in social context between the individual and higher social scales could have skewed results for monk parakeets.
Our overall results can be reproduced by using the data provided with this publication (extended selection tables for the subset of calls used for visual classification, and the matrix of visual similarity combined across observers). Some changes to this code may be necessary, such as directories used for analysis. Users should expect slight differences in results from some unsupervised or supervised machine learning methods, due to stochasticity associated with these methods.
rm(list = ls())
X <- c("tidyverse", "vegan", "pbapply", "dplyr", "data.table", "mclust", "effsize", "knitr", "warbleR")
invisible(lapply(X, library, character.only = TRUE))
theme_AcousticSpace <- function(){
theme(panel.background = element_rect(fill = "white"), plot.background = element_rect(fill = "white"),
panel.grid.major = element_line(size = 1, colour = "white"),
panel.grid.minor = element_line(size = 0.75, colour = "white"),
axis.line = element_line(size = 0.45, colour = "black"),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12))
}
Set directory paths for analysis.
in_path_st <- "/media/owner/MYIOPSITTA/R/Uruguay2017_MonkParakeet_CallAnalysis/Data/visual-inspection/www"
in_path_classes <- "/media/owner/MYIOPSITTA/R/Uruguay2017_MonkParakeet_CallAnalysis/Data/visual-inspection/classifications"
out_path <- "/media/owner/MYIOPSITTA/R/Uruguay2017_MonkParakeet_CallAnalysis/Data/Site_Recordings"
Set general parameters.
social_scales <- c("Individual", "Pair", "Flock", "Site")
perms <- 9999
cores <- parallel::detectCores() - 2
seed <- 401
Read in .csv files of human classifications, obtained from Shiny app.
files <- list.files(pattern = ".csv", path = in_path_classes)
# files
csvs <- lapply(1:length(files), function(x){ read.csv(file.path(in_path_classes, files[x])) })
length(csvs)
## [1] 12
Here we asked how consistently human observers classified calls at the individual scale. This was the only social scale at which it was possible to calculate classification accuracy by human visual inspection.
# write a loop to extract individual scale information, assign individual IDs to classes by a majority rule and calculate classification accuracy per observer
observs <- 1:length(csvs)
classif_accur <- invisible(unlist(pblapply(observs, function(i){
tmp <- t(csvs[[i]][grep("Individual", csvs[[i]]$social_scale), grep("class_", names(csvs[[i]]))])
tmp <- data.frame(class = row.names(tmp), spectros = tmp[, 1])
# iterate over classes to assign an individual ID to each class
class_IDs <- unlist(lapply(1:nrow(tmp), function(x){
# obtain calls for the given class
calls <- as.character(tmp$spectros[x])
calls <- strsplit(calls, split = ";")[[1]]
# obtain individual IDs from the spectrogram image names
indivs <- sapply(1:length(calls), function(z){
strsplit(strsplit(calls[z], split = "INDIV-")[[1]][2], split = "_")[[1]][1]
})
if(all(!is.na(indivs))){
# find the individual with the most spectrograms assiged to this class
# this individual ID will become assigned to the given class to facilitate calculating classification accuracy
itbl <- table(indivs)
m <- max(itbl)
class_ID <- names(itbl)[which(itbl == m)]
} else {
class_ID <- NA
}
return(class_ID)
}))
# if each class could be assigned a unique individual ID
if(all(!is.na(class_IDs)) & !any(duplicated(class_IDs))){
tmp_class_IDs <- class_IDs
# if some classes were left empty
} else if(any(is.na(class_IDs))){
tmp_class_IDs <- class_IDs[!is.na(class_IDs)]
# if some classes were evenly assigned to different individuals
} else if(any(duplicated(class_IDs)) & all(!is.na(class_IDs))){
tmp_class_IDs <- class_IDs[!duplicated(class_IDs)]
}
wrong <- unlist(lapply(1:nrow(tmp), function(x){
# obtain calls for the given class
calls <- as.character(tmp$spectros[x])
calls <- strsplit(calls, split = ";")[[1]]
# obtain individual IDs from the spectrogram image names
indivs <- sapply(1:length(calls), function(z){
strsplit(strsplit(calls[z], split = "INDIV-")[[1]][2], split = "_")[[1]][1]
})
itbl <- table(indivs)
wrong <- as.vector(itbl[which(names(itbl) != tmp_class_IDs[x])])
return(wrong)
}))
wrong_class <- sum(wrong)
# extract the total number of calls classified
N <- sum(unlist(lapply(1:nrow(tmp), function(z){
calls <- as.character(tmp$spectros[z])
length(strsplit(calls, split = ";")[[1]])
})))
# calculate classification accuracy over all calls
return(((N - wrong_class)/N)*100)
})))
classif_accur
## [1] 75.00000 87.50000 75.00000 87.50000 55.55556 87.50000 50.00000
## [8] 81.25000 75.00000 75.00000 37.50000 75.00000
mean(classif_accur)
## [1] 71.81713
range(classif_accur)
## [1] 37.5 87.5
sd(classif_accur)
## [1] 15.94189
Read in extended selection tables per social scale used in Shiny app. These will determine how calls will be ordered in matrices across social scales.
# read in the extended selection tables after moving them into the Shiny app working directory
indiv_shiny_tbl <- readRDS(file.path(path = in_path_st, "indiv_scale_shiny_extended_seltable.RDS"))
pair_shiny_tbl <- readRDS(file.path(path = in_path_st, "pair_scale_shiny_extended_seltable.RDS"))
flock_shiny_tbl <- readRDS(file.path(path = in_path_st, "flock_scale_shiny_extended_seltable.RDS"))
site_shiny_tbl <- readRDS(file.path(path = in_path_st, "site_scale_shiny_extended_seltable.RDS"))
sel_tables <- list(indiv_shiny_tbl, pair_shiny_tbl, flock_shiny_tbl, site_shiny_tbl)
Write a function to convert human classifications into a list of binary matrices (one per social scale).
df2binmat <- function(X, class_prefix){
# iterate over social scales to make a list of binary matrices
mat_list <- invisible(pblapply(1:length(social_scales), function(i){
# get all calls for the given social scale
sel_table <- sel_tables[[i]]
calls_df <- data.frame(call_ID = paste(sel_table$sound.files, "-1.jpeg", sep = ""))
# now get the class labels for these calls
# first extract calls and classes for the given social scale
tmp <- droplevels(X[grep(paste("^", social_scales[i], "$", sep = ""), X$social_scale), ])
class_cols <- tmp[, grep(paste(paste("^", class_prefix, sep = ""), collapse = "|"), names(X))]
class_cols <- t(class_cols)
# then iterate over rows and turn this into a data frame with one row per call
class_df <- data.table::rbindlist(lapply(1:nrow(class_cols), function(y){
calls_tmp <- unlist(strsplit(class_cols[y, ], split = ";"))
df <- data.frame(class_label = row.names(class_cols)[y], call_ID = calls_tmp)
return(df)
}))
# iterate over rows of the calls data frame for the given social scale, pulling out class labels from the class data frame
final_df <- data.table::rbindlist(lapply(1:nrow(calls_df), function(y){
call_tmp <- calls_df[y, ]
label <- as.character(class_df$class_label[grep(paste("^", call_tmp, "$", sep = ""), class_df$call_ID)])
return(data.frame(call_ID = call_tmp, class_label = label, ID = as.character(sel_table$ID[y])))
}))
# iterate over each row in this data frame and compare the label of the given call to labels of all other calls
# this comparison is the same as asking whether or not calls were classified in the same group
# this will return a long binary vector that can be used to build the final matrix
class_comps <- unlist(lapply(1:nrow(calls_df), function(y){
as.numeric(final_df$class_label[y] == final_df$class_label)
}))
# build the final matrix of human classifications, which contains binary encoding for whether or not calls were classified in the same group per observer and social scale
human_class_mat <- matrix(class_comps, nrow = nrow(final_df), ncol = nrow(final_df), byrow = TRUE)
dimnames(human_class_mat) <- list(final_df$ID, final_df$ID)
return(human_class_mat)
}))
names(mat_list) <- social_scales
return(mat_list)
}
human_class_mats_by_observer <- pblapply(1:length(csvs), function(x){
df2binmat(X = csvs[[x]], class_prefix = "class")
})
names(human_class_mats_by_observer) <- paste("Observer", seq(1, length(human_class_mats_by_observer), 1), sep = "-")
# str(human_class_mats_by_observer)
saveRDS(human_class_mats_by_observer, file.path("/media/owner/MYIOPSITTA/R/Uruguay2017_MonkParakeet_CallAnalysis/Data/Site_Recordings", "human_class_mats_by_observer.RDS"))
These matrices look good. We moved on to subsetting SPCC and random forests acoustic similarity matrices for the given calls per social scale.
# read in repeatedly sampled individuals SPCC and random forests matrices
# also the SPCC and random forests matrices for higher social scales
i_path <- "/media/owner/MYIOPSITTA/R/Uruguay2017_MonkParakeet_CallAnalysis/Data/Focal_Individuals"
h_path <- "/media/owner/MYIOPSITTA/R/Uruguay2017_MonkParakeet_CallAnalysis/Data/Site_Recordings"
human_class_mats_by_observer <- readRDS(file.path(h_path, "human_class_mats_by_observer.RDS"))
# repeatedly sampled individuals
# needs to be filtered by calls used in the human classification, which were the same calls used for random forests validation
xc_mat_i <- readRDS(file.path(i_path, "xc_mat_indiv_scale.RDS")) # SPCC
# str(xc_mat_i)
# this already contains only calls in the random forests validation data set (e.g. the individuals used in human classification)
# has no dimension names
rf_mat_i <- readRDS(file.path(h_path, "proxm_ms.RDS")) # random forests
# str(rf_mat_i)
# higher social scales
# these need to be filtered for pair, flock and site calls used in human classification
xc_mat_h <- readRDS(file.path(h_path, "xc_mat_higher_social_scales.RDS")) # SPCC
# str(xc_mat_h)
# add dimension names to the rf_mat to facilitate filtering, these are the same as the xc_mat for higher social scales
rf_mat_h <- readRDS(file.path(h_path, "proxm_site.RDS")) # random forests
dimnames(rf_mat_h) <- dimnames(xc_mat_h)
# str(rf_mat_h)
Subset the SPCC and random forests matrices for repeatedly sampled individuals used in human classification.
sel_table <- sel_tables[[1]]
# unique(sel_table$scale) # checking
# dimnames(xc_mat_i)
# make the selection table call IDs match the SPCC matrix dimension names
call_IDs <- sel_table$sound.files
call_IDs <- gsub("WAV_", "WAV-", call_IDs)
# filter the SPCC matrix by the call_IDs in the selection table used for human classification
xc_mat_i_sub <- xc_mat_i[grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(xc_mat_i)[[1]]), grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(xc_mat_i)[[2]])]
# dimnames(xc_mat_i_sub) # checking
# replace dimnames to be friendlier for plotting
dimnames(xc_mat_i_sub) <- list(sel_table$ID, sel_table$ID)
dim(xc_mat_i_sub)
# filter the random forests matrix by the call_IDs in the selection table used for human classification
# first give the random forests matrix dimension names
rf_dim_nms <- dimnames(xc_mat_i)[[1]][grep("RAW|ZW8|-BIRD3|-BIRD5", dimnames(xc_mat_i)[[1]])]
dimnames(rf_mat_i) <- list(rf_dim_nms, rf_dim_nms)
rf_mat_i_sub <- rf_mat_i[grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(rf_mat_i)[[1]]), grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(rf_mat_i)[[2]])]
# dimnames(rf_mat_i_sub) # checking
# replace dimnames to be friendlier for plotting
dimnames(rf_mat_i_sub) <- list(sel_table$ID, sel_table$ID)
# make the binary matrix of group identity at this social scale
group_IDs <- sapply(1:nrow(sel_table), function(x){
as.numeric(sel_table$ID[x] == sel_table$ID)
})
binmat_indiv_ID <- matrix(group_IDs, byrow = TRUE, nrow = nrow(sel_table), ncol = nrow(sel_table))
dimnames(binmat_indiv_ID) <- list(sel_table$ID, sel_table$ID)
# binmat_indiv_ID # checking
Read in the selection table and metadata for calls at higher social scales.
ccs <- read.csv(file.path(h_path, "higher_social_scales_calls_preprocessed_final.csv"), header = TRUE)
Subset the SPCC and random forests matrices for pair calls used in human classification.
sel_table <- sel_tables[[2]]
# unique(sel_table$scale) # checking
# check out call IDs between the human classification selection table and the SPCC matrix
# sel_table$sound.files
# dimnames(xc_mat_h)[[1]]
# make the selection table call IDs match the SPCC matrix dimension names
call_IDs <- sel_table$sound.files
call_IDs <- gsub("WAV_", "WAV-", call_IDs)
# filter the SPCC matrix by the call_IDs in the selection table used for human classification
xc_mat_h_pairs <- xc_mat_h[grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(xc_mat_h)[[1]]), grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(xc_mat_h)[[2]])]
# dimnames(xc_mat_h_pairs) # checking
# replace dimnames to be friendlier for plotting
dimnames(xc_mat_h_pairs) <- list(sel_table$ID, sel_table$ID)
# repeat filtering for the random forests matrix
rf_mat_h_pairs <- rf_mat_h[grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(rf_mat_h)[[1]]), grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(rf_mat_h)[[2]])]
# dimnames(rf_mat_h_pairs) # checking
# replace dimnames to be friendlier for plotting
dimnames(rf_mat_h_pairs) <- list(sel_table$ID, sel_table$ID)
# make the binary matrix of group identity at this social scale
group_IDs <- sapply(1:nrow(sel_table), function(x){
as.numeric(sel_table$ID[x] == sel_table$ID)
})
binmat_pair_ID <- matrix(group_IDs, byrow = TRUE, nrow = nrow(sel_table), ncol = nrow(sel_table))
dimnames(binmat_pair_ID) <- list(sel_table$ID, sel_table$ID)
# binmat_pair_ID # checking
Subset the SPCC and random forests matrices for flock calls used in human classification.
sel_table <- sel_tables[[3]]
# unique(sel_table$scale) # checking
# make the selection table call IDs match the SPCC matrix dimension names
call_IDs <- sel_table$sound.files
call_IDs <- gsub("WAV_", "WAV-", call_IDs)
# filter the SPCC matrix by the call_IDs in the selection table used for human classification
xc_mat_h_flocks <- xc_mat_h[grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(xc_mat_h)[[1]]), grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(xc_mat_h)[[2]])]
# dimnames(xc_mat_h_flocks) # checking
# replace dimnames to be friendlier for plotting
dimnames(xc_mat_h_flocks) <- list(sel_table$ID, sel_table$ID)
# repeat filtering for the random forests matrix
rf_mat_h_flocks <- rf_mat_h[grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(rf_mat_h)[[1]]), grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(rf_mat_h)[[2]])]
# dimnames(rf_mat_h_flocks) # checking
# replace dimnames to be friendlier for plotting
dimnames(rf_mat_h_flocks) <- list(sel_table$ID, sel_table$ID)
# make the binary matrix of group identity at this social scale
group_IDs <- sapply(1:nrow(sel_table), function(x){
as.numeric(sel_table$ID[x] == sel_table$ID)
})
binmat_flock_ID <- matrix(group_IDs, byrow = TRUE, nrow = nrow(sel_table), ncol = nrow(sel_table))
dimnames(binmat_flock_ID) <- list(sel_table$ID, sel_table$ID)
# binmat_flock_ID # checking
Subset the SPCC and random forests matrices for site calls used in human classification. Also, make a binary matrix of group identity for these site-scale calls.
sel_table <- sel_tables[[4]]
# unique(sel_table$scale) # checking
# make the selection table call IDs match the SPCC matrix dimension names
call_IDs <- sel_table$sound.files
call_IDs <- gsub("WAV_", "WAV-", call_IDs)
# filter the SPCC matrix by the call_IDs in the selection table used for human classification
xc_mat_h_sites <- xc_mat_h[grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(xc_mat_h)[[1]]), grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(xc_mat_h)[[2]])]
# dimnames(xc_mat_h_sites) # checking
# replace dimnames to be friendlier for plotting
dimnames(xc_mat_h_sites) <- list(sel_table$ID, sel_table$ID)
# repeat filtering for the random forests matrix
rf_mat_h_sites <- rf_mat_h[grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(rf_mat_h)[[1]]), grep(paste(paste("^", call_IDs, "$", sep = ""), collapse = "|"), dimnames(rf_mat_h)[[2]])]
# dimnames(rf_mat_h_sites) # checking
# replace dimnames to be friendlier for plotting
dimnames(rf_mat_h_sites) <- list(sel_table$ID, sel_table$ID)
# make the binary matrix of group identity at this social scale
group_IDs <- sapply(1:nrow(sel_table), function(x){
as.numeric(sel_table$ID[x] == sel_table$ID)
})
binmat_site_ID <- matrix(group_IDs, byrow = TRUE, nrow = nrow(sel_table), ncol = nrow(sel_table))
dimnames(binmat_site_ID) <- list(sel_table$ID, sel_table$ID)
# binmat_site_ID # checking
The subsetted SPCC and random forests matrices look good, as do the binary matrices of group identity across social scales. We used a model-based clustering approach to ask how well clustering algorithms classified calls based on similarity matrices per method.
# x <- 2
human_mat_list <- invisible(pblapply(1:length(social_scales), function(x){
ss <- social_scales[x]
# extract human classification matrices across observers for this social scale
human_class_mats <- lapply(1:length(human_class_mats_by_observer), function(x){
human_class_mats_by_observer[[x]][[ss]]
})
# add matrices for this scale together across observers
# yields a matrix in which each cell indicates how often calls were classified together across observers
human_class_mat <- Reduce('+', human_class_mats)
# scale the matrix values between 0 and 1 to be as similar as possible to the SPCC and random forests matrices
human_class_mat <- apply(human_class_mat, MARGIN = 2, FUN = function(X) (X - min(X))/diff(range(X)))
return(human_class_mat)
}))
names(human_mat_list) <- social_scales
SPCC_mat_list <- list(xc_mat_i_sub, xc_mat_h_pairs, xc_mat_h_flocks, xc_mat_h_sites)
names(SPCC_mat_list) <- social_scales
rf_mat_list <- list(rf_mat_i_sub, rf_mat_h_pairs, rf_mat_h_flocks, rf_mat_h_sites)
names(rf_mat_list) <- social_scales
asim_method <- c("Visual Inspection", "SPCC", "Random Forests")
asim_mat_list <- list(human_mat_list, SPCC_mat_list, rf_mat_list)
names(asim_mat_list) <- asim_method
Iterate over similarity methods and social scales to: 1) perform t-SNE per similarity method, 2) perform model-based clustering per simiarity method, 3) make a data frame with the new embeddings as well as metadata for plotting, including the model-based clustering classes.
df_list <- invisible(pblapply(1:length(asim_method), function(i){
asim_mats <- asim_mat_list[[asim_method[i]]]
# iterate over social scales
df_list <- lapply(1:length(social_scales), function(x){
ss <- social_scales[x]
# convert the similarity matrix to a distance matrix
# will be used for t-SNE and model-based clustering
dist_mat <- stats::as.dist(1 - asim_mats[[x]], diag = TRUE, upper = TRUE)
# perform t-SNE
# the optimized perplexity for focal individual feature extraction was too large for this t-SNE visualization, so we used the maximum perplexity possible given the dimensions of the distance matrix
plx <- (dim(asim_mats[[x]])[1]/3) - 1
# visualize with t-SNE in 2 dimensions
tmp <- list()
tmp <- lapply(1:10, function(n){
Rtsne::Rtsne(dist_mat, dims = 2, pca = FALSE, max_iter = 5000, perplexity = plx, is_distance = TRUE, theta = 0.0)
})
# str(tmp)
KL <- sapply(1:length(tmp), function(x){
tmp[[x]]$itercosts[100]
}, simplify = TRUE)
# overwrite tmp with the solution that minimized KL
tsne <- tmp[[which(KL == min(KL))]]
# perform model-based clustering while limiting the possible number of clusters to the true number of groups in each call data set
set.seed(seed)
mBIC_res <- mclust::Mclust(data = dist_mat, G = 1:4)
# make a data frame with output
df <- data.frame(asim_method = asim_method[i], social_scale = ss, X = tsne$Y[, 1], Y = tsne$Y[, 2], true_group = as.numeric(factor(dimnames(asim_mats[[x]])[[1]])), cluster = mBIC_res$classification)
return(df)
})
return(rbindlist(df_list))
}))
clustering_df <- rbindlist(df_list)
clustering_df$true_group <- factor(clustering_df$true_group)
clustering_df$cluster <- factor(clustering_df$cluster)
str(clustering_df)
saveRDS(clustering_df, file.path(out_path, "clustering_df.RDS"))
Here, find the number of calls misclassified per true group across social scales and similarity methods. The resulting data frame will be used for labels in the faceted plot below.
clustering_df <- readRDS(file.path(out_path, "clustering_df.RDS"))
# make a new column of numeric group ID
clustering_df$true_group_num <- as.numeric(clustering_df$true_group)
clustering_df$asim_method <- as.character(clustering_df$asim_method)
clustering_df$asim_method <- factor(clustering_df$asim_method, levels = c("Visual Inspection", "SPCC", "Random Forests"))
asim_method <- c("Visual Inspection", "SPCC", "Random Forests")
clusters <- 4 # each social scale has 4 clusters
# number of calls per group per social scale
num_spectros <- c(4, 2, 3, 4)
# iterate over similarity methods
df_list <- invisible(pblapply(1:length(asim_method), function(i){
# iterate over social scales
df_list <- lapply(1:length(social_scales), function(x){
am <- asim_method[i]
ss <- social_scales[x]
# subset data and count observations per cluster identified by model-based clustering
clust_df <- clustering_df %>%
filter(as.character(asim_method) == am) %>%
filter(social_scale == ss) %>%
droplevels()
# setting parameters for placing text labels in faceted plot below
if(am == "Visual Inspection" & ss == "Individual"){
X <- -Inf
Y <- -Inf
} else if(am == "SPCC" & ss == "Pair"){
X <- Inf
Y <- -Inf
} else if(am == "Random Forests" & ss == "Pair"){
X <- Inf
Y <- -Inf
} else if(am == "Visual Inspection" & ss == "Flock"){
X <- -Inf
Y <- -Inf
} else {
X <- -Inf
Y <- -Inf
}
# iterate over clusters to find the number of incorrectly assigned calls per cluster
incorr <- unlist(lapply(1:clusters, function(y){
tmp <- clust_df %>%
filter(cluster == y) %>%
droplevels()
if(nrow(tmp) < num_spectros[x] & length(unique(tmp$true_group)) == 1){
incorr <- num_spectros[x] - nrow(tmp)
} else if(nrow(tmp) > num_spectros[x] & length(unique(tmp$true_group)) > 1){
incorr <- nrow(tmp) - num_spectros[x]
} else if(nrow(tmp) <= num_spectros[x] & length(unique(tmp$true_group)) > 1){
incorr <- abs(length(unique(tmp$true_group)) - length(unique(tmp$cluster)))
# correctly assigned
} else if(nrow(tmp) == num_spectros[x] & length(unique(tmp$true_group)) == 1){
incorr <- 0
# if no calls were assigned to this cluster, these incorrect assignments will be picked up when evaluating other clusters
} else if(nrow(tmp) == 0){
incorr <- 0
}
return(incorr)
}))
incorr <- (sum(unlist(incorr))/nrow(clust_df)*100)
percent_corr <- paste(round(100 - incorr, digits = 2), "%", sep = "")
df <- data.frame(asim_method = am, social_scale = ss, percent_corr = percent_corr, X = X, Y = Y)
return(df)
})
return(rbindlist(df_list))
}))
corr_df <- rbindlist(df_list)
corr_df
## asim_method social_scale percent_corr X Y
## 1: Visual Inspection Individual 87.5% -Inf -Inf
## 2: Visual Inspection Pair 75% -Inf -Inf
## 3: Visual Inspection Flock 58.33% -Inf -Inf
## 4: Visual Inspection Site 87.5% -Inf -Inf
## 5: SPCC Individual 87.5% -Inf -Inf
## 6: SPCC Pair 75% Inf -Inf
## 7: SPCC Flock 66.67% -Inf -Inf
## 8: SPCC Site 43.75% -Inf -Inf
## 9: Random Forests Individual 87.5% -Inf -Inf
## 10: Random Forests Pair 75% Inf -Inf
## 11: Random Forests Flock 50% -Inf -Inf
## 12: Random Forests Site 68.75% -Inf -Inf
Make a faceted plot by smilarity method and social scales of the true group identity versus model-based clustering results. This is Supplementary Figure 6 in supplementary materials.
# initialize aesthetics
n <- 12
cols <- c(topo.colors(n, alpha = 1)[2], terrain.colors(n, alpha = 1)[2], heat.colors(n, alpha = 1)[1], heat.colors(n, alpha = 1)[5])
shps <- c(0, 21, 24, 6)
# find the min and max per level of grouping variables
x_maxes <- tapply(clustering_df$X, list(clustering_df$social_scale, clustering_df$asim_method), max, simplify = TRUE)
x_mins <- tapply(clustering_df$X, list(clustering_df$social_scale, clustering_df$asim_method), min)
y_maxes <- tapply(clustering_df$Y, list(clustering_df$social_scale, clustering_df$asim_method), max, simplify = TRUE)
y_mins <- tapply(clustering_df$Y, list(clustering_df$social_scale, clustering_df$asim_method), min)
# find a good x and y buffer per combo of factor levels
x_buf <- tapply(clustering_df$X, list(clustering_df$social_scale, clustering_df$asim_method), function(x){ return(max(x)/3) }, simplify = TRUE)
x_buf
## Visual Inspection SPCC Random Forests
## Individual 52.69264 67.08727 25.08714
## Pair 75.17353 52.03722 79.10235
## Flock 68.02524 55.50820 196.42636
## Site 42.31252 23.08736 47.10373
y_buf <- tapply(clustering_df$Y, list(clustering_df$social_scale, clustering_df$asim_method), function(x){ return(max(x)/3) }, simplify = TRUE)
y_buf
## Visual Inspection SPCC Random Forests
## Individual 41.18420 36.37380 22.63860
## Pair 131.72327 43.23936 269.69292
## Flock 63.33374 56.66033 393.91938
## Site 51.56966 44.52101 45.91808
blank_data <- data.frame(asim_method = rep(corr_df$asim_method, 2), social_scale = rep(corr_df$social_scale, 2), x = c(as.vector(x_mins) - as.vector(x_buf), as.vector(x_maxes) + as.vector(x_buf)), y = c(as.vector(y_mins) - as.vector(y_buf), as.vector(y_maxes) + as.vector(y_buf)))
# make graphic
gg <- ggplot(data = clustering_df, aes(x = X, y = Y)) +
# yields nice strip text on top and right but cannot yield truly free axes
# facet_grid(rows = vars(social_scale), cols = vars(asim_method), scales = "free", shrink = TRUE, space = "fixed") +
facet_wrap(vars(social_scale, asim_method), ncol = length(unique(clustering_df$asim_method)), scales = "free") +
# labeller = label_values
geom_point(aes(shape = true_group), fill = gray.colors(n, alpha = 1)[2], size = 3.5) +
# overlay circular points for clusters
geom_point(aes(color = cluster), shape = 19, size = 2) +
scale_shape_manual(values = shps) +
guides(shape = guide_legend(title = "True Group"), color = guide_legend(title = "Cluster")) +
xlab("t-SNE Dimension 1") + ylab("t-SNE Dimension 2") +
geom_blank(data = blank_data, aes(x = x, y = y)) +
expand_limits(y = 0) + scale_y_continuous(expand = c(0, 0)) +
geom_text(data = corr_df, aes(label = percent_corr, vjust = -0.45), hjust = 0.1*c(-1, -1, -1, -1, -1, 11, -1, -1, -1, 11, -1, -1), size = 4, fontface = "bold") +
# scale_color_manual(colors = cols) +
theme(panel.background = element_rect(fill = "white", color = "black"), plot.background = element_rect(fill = "white"), panel.grid.major = element_line(size = 1, colour = "white"), panel.grid.minor = element_line(size = 0.75, colour = "white"), axis.line = element_line(size = 0.5, colour = "black"), axis.title = element_text(size = 14), axis.text = element_text(size = 10), strip.text = element_text(size = 11, face = "bold"), legend.position = "top", legend.title = element_text(size = 12), legend.text = element_text(size = 12)) +
ggtitle("")
gg
# ggsave(file.path(out_path, "SupplementaryFigure6_CompareAcousticMethods.tiff"), units = "in", height = 8.5, width = 6.9, dpi = 325)
# extract cluster colors from this plot for future reference
# p <- ggplot_build(gg)
# unique(p$data[[2]]["colour"])
We recorded repeated calls from individuals over narrow recording windows. Here we report how narrow these recording windows were per repeatedly sampled individual, and whether acoustic similarity changed signficantly over these sampling windows per individual.
Read in the selection table for repeatedly sampled individuals.
ccs_fi <- read.csv(file.path(i_path, "indiv_scale_calls_preprocessed.csv"), header = TRUE)
# str(ccs_fi)
How many recordings per repeatedly sampled individual?
tapply(ccs_fi$sound.files, ccs_fi$indiv, function(X){
length(unique(X))
})
## AAT RAW UM1 UM2 UM3 UM4 UM5 ZW8
## 5 2 2 1 1 1 1 4
Merge the selection table with metadata from whole recordings.
meta <- read.csv(file.path(h_path, "ParakeetRecordings_MASTER_03_14_18_GSV.csv"), header = TRUE)
str(meta)
## 'data.frame': 945 obs. of 16 variables:
## $ Row.ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Year : int 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 ...
## $ Month : int 5 5 5 5 5 6 6 6 6 6 ...
## $ Day : int 7 21 21 21 21 19 19 19 19 19 ...
## $ Time : Factor w/ 305 levels "",".","06:02",..: 15 144 103 78 71 189 190 197 204 211 ...
## $ DataEntry_Initials : Factor w/ 3 levels "CH","CH/GSV",..: 3 3 3 3 3 2 2 2 2 2 ...
## $ Site_ID : Factor w/ 48 levels "1145-01","1145-02",..: 4 48 29 38 38 34 34 34 34 34 ...
## $ Recordist : Factor w/ 3 levels "CH","CH/GSV",..: 3 3 3 3 3 1 1 1 1 1 ...
## $ Raw_File_Name : Factor w/ 945 levels "1003.WAV","1004.WAV",..: 1 5 4 3 2 642 643 644 645 646 ...
## $ Site_Recording_File_Name : Factor w/ 945 levels "2017_05_07_ARAP_1003.WAV",..: 1 5 2 4 3 6 7 8 9 10 ...
## $ Focal_individual : Factor w/ 15 levels "173","9U4","AAT",..: NA NA NA NA NA NA NA NA NA NA ...
## $ FocalIndividual_Recording_File_Name: Factor w/ 25 levels "2017_07_17_EMBR_INDIV-173_MZ000191.WAV",..: NA NA NA NA NA NA NA NA NA NA ...
## $ Site_selections : Factor w/ 2 levels "N","Y": 2 1 1 2 1 1 2 2 2 2 ...
## $ Alarm_call : Factor w/ 3 levels "","N","Y": 2 1 1 3 1 2 3 3 2 2 ...
## $ Time_Alarm_Call : Factor w/ 40 levels "","0.75s, 8.1s, 22.4",..: 16 1 1 22 1 NA 39 26 NA NA ...
## $ Notes : Factor w/ 640 levels "","1 call each from birds 7 and 8 circling out from nests and quickly returning, some noise from other avian speci"| __truncated__,..: 125 436 518 121 410 395 623 252 127 31 ...
ccs_fi2 <- ccs_fi %>%
left_join(
meta %>%
dplyr::select(FocalIndividual_Recording_File_Name, Time)
, by = c("sound.files" = "FocalIndividual_Recording_File_Name")
) %>%
droplevels()
## Warning: Column `sound.files`/`FocalIndividual_Recording_File_Name` joining
## factors with different levels, coercing to character vector
# str(ccs_fi2)
How many recordings have time of day recorded per individual?
tmp <- ccs_fi2 %>%
filter(Time != "")
tapply(tmp$sound.files, tmp$indiv, function(X){
length(unique(X))
})
## AAT RAW UM1 UM2 UM3 UM4 UM5 ZW8
## 1 NA 2 NA NA NA 1 NA
How long are each of these recordings (in minutes) per individual?
wavs <- tmp %>%
dplyr::select(sound.files) %>%
distinct() %>%
pull()
dur_df <- rbindlist(lapply(1:length(wavs), function(x){
wav_dur(wavs[x], path = i_path)
}))
dur_df %>%
mutate(
duration = round(duration/60, digits = 2)
)
## sound.files duration
## 1 2017_07_28_1145-01_INDIV-UNMARKED-BIRD1_1192.WAV 2.55
## 2 2017_07_28_1145-01_INDIV-UNMARKED-BIRD1_1193.WAV 5.95
## 3 2017_07_29_1145-02_INDIV-AAT_1224.WAV 5.74
## 4 2017_08_21_CHAC_INDIV-UNMARKED-BIRD5_1242.WAV 3.40
We recorded Unmarked Bird 1 over 8.50 minutes, Unmarked Bird 5 over 3.40 minutes and marked bird AAT over 5.74 minutes. All other repeatedly sampled individuals were recorded over a single day, and typically over a 2 hour time frame (CHECK FIELD NOTEBOOKS).
Although we did not have times for every single recording, we could still perform an analysis to ask whether acoustic similarity changed over these recording periods. We had selected contact calls sequentially within and across recordings, so calls could be assigned integer values in consecutive order to represent sampling time. Marked bird AAT was recorded by 2 recordists, and we confirmed the order of recordings by listening to original .wav files (calls selected in 2017_07_29_1145-02_INDIV-AAT_1224.WAV came first).
ccs_fi2 <- ccs_fi %>%
# make sure the levels of the individual ID column line up with the order in data frame
mutate(
indiv = factor(indiv, levels = unique(ccs_fi$indiv))
) %>%
mutate(
call_order = unlist(tapply(sound.files, indiv, function(X){
seq(1, length(X), 1)
})))
# looks good
# View(ccs_fi2)
Perform Mantel tests of acoustic and sequence distance per repeatedly sampled individual.
# make a vector of unique individual IDs
indivs <- as.character(unique(ccs_fi2$indiv))
indivs
# read in the matrix of SPCC similarity for repeatedly sampled individuals
xc_mat_i <- readRDS(file.path(i_path, "xc_mat_indiv_scale.RDS"))
# str(xc_mat_i)
# replace dimnames of each matrix with individual IDs
rsindiv_IDs <- gsub("BIRD", "UM", sapply(1:length(dimnames(xc_mat_i)[[1]]), function(x){
if(grepl("UNM", dimnames(xc_mat_i)[[1]][x])){
strsplit(strsplit(dimnames(xc_mat_i)[[1]][x], split = "_")[[1]][5], split = "-")[[1]][3]
} else {
strsplit(strsplit(dimnames(xc_mat_i)[[1]][x], split = "_")[[1]][5], split = "-")[[1]][2]
}
}))
dimnames(xc_mat_i) <- list(rsindiv_IDs, rsindiv_IDs)
# str(xc_mat_i)
# convert to a distance matrix
xc_mat_i <- 1 - xc_mat_i
# str(xc_mat_i)
# set permutations and cores for Mantel tests
perms <- 9999
cores <- parallel::detectCores() - 2
# i <- 1 # testing
seq_simil <- rbindlist(invisible(pblapply(1:length(indivs), function(i){
# make a matrix of sequence distance per individual
tmp <- ccs_fi2 %>%
filter(indiv == indivs[i]) %>%
pull(call_order)
seq_dist_mat <- stats::dist(tmp, diag = TRUE, upper = TRUE)
# subset the SPCC distance matrix by the given individual
pat <- grep(paste("^", indivs[i], "$", sep = ""), dimnames(xc_mat_i)[[1]])
tmp_xc_mat <- xc_mat_i[pat, pat]
# perform a Mantel test
mntl_res <- vegan::mantel(xdis = seq_dist_mat, ydis = tmp_xc_mat, method = "pearson", permutations = perms, parallel = cores)
return(data.frame(indiv = indivs[i], num_calls = length(tmp), mantel_r = round(mntl_res$statistic, digits = 2), mantel_p = round(mntl_res$signif, digits = 4), permutations = mntl_res$permutations))
})))
saveRDS(seq_simil, file.path(i_path, "Mantel_test_temporal_sequences.RDS"))
Print Mantel test results.
seq_simil <- readRDS(file.path(i_path, "Mantel_test_temporal_sequences.RDS"))
seq_simil
## indiv num_calls mantel_r mantel_p permutations
## 1: RAW 4 -0.14 0.5833 23
## 2: ZW8 8 -0.04 0.5525 9999
## 3: UM2 23 0.09 0.0813 9999
## 4: UM3 5 0.70 0.0500 119
## 5: UM4 13 -0.24 0.9747 9999
## 6: UM1 25 0.14 0.0407 9999
## 7: AAT 12 0.05 0.3491 9999
## 8: UM5 7 -0.31 0.9151 5039
Adjust alpha of 0.05 using a Bonferroni correction.
# the adjusted alpha by a Bonferroni correction:
alpha <- 0.05
adj_alpha <- round(alpha/nrow(seq_simil), digits = 4)
adj_alpha
## [1] 0.0062
There was no significant relationship between acoustic distance and temporal sequence distance of calls recorded per repeated individual. All p-values were greater than the adjusted alpha (see above).
We also ran this analysis for the unmarked individuals for which calls were contained in a single recording. Here, we used the start and end times of calls within recordings to calculate exact temporal distance among calls in a sequence.
tapply(ccs_fi$sound.files, ccs_fi$indiv, function(X){
length(unique(X))
})
indivs <- c("UM2", "UM3", "UM4", "UM5")
# i <- 1
# x <- 23
seq_simil <- rbindlist(invisible(pblapply(1:length(indivs), function(i){
# for each call for the given individual, find the distance to all other calls in the same recording
tmp <- ccs_fi %>%
filter(indiv == indivs[i])
# str(tmp)
# loop over each call, and make two subsets:
# calls that are before it and after it
# then calculate the temporal distance based on call placement
dists <- unlist(lapply(1:nrow(tmp), function(x){
b4 <- c(x, which(tmp$start < tmp$start[x]))
before <- b4[order(b4)]
after <- which(tmp$start > tmp$start[x])
if(length(before) > 0){
dists_b <- tmp$start[x] - tmp[before, "start"]
}
if(length(after) > 0){
dists_a <- tmp[after, "start"] - tmp$start[x]
}
if(length(before) > 0 & length(after) > 0){
return(c(dists_b, dists_a))
} else if(length(before) == 0){
return(c(dists_a))
} else if(length(after) == 0){
return(c(dists_b))
}
}))
# turn this vector into a symmetric matrix of temporal distance per individual
mat <- matrix(dists, byrow = TRUE, nrow = nrow(tmp), ncol = nrow(tmp))
# mat[1:10, 1:10] # checking, looks good
# convert to a distance object
seq_dist_mat <- stats::as.dist(mat, diag = TRUE, upper = TRUE)
# subset the SPCC distance matrix by the given individual
pat <- grep(paste("^", indivs[i], "$", sep = ""), dimnames(xc_mat_i)[[1]])
tmp_xc_mat <- xc_mat_i[pat, pat]
# perform a Mantel test
mntl_res <- vegan::mantel(xdis = seq_dist_mat, ydis = tmp_xc_mat, method = "pearson", permutations = perms, parallel = cores)
return(data.frame(indiv = indivs[i], num_calls = nrow(tmp), mantel_r = round(mntl_res$statistic, digits = 2), mantel_p = round(mntl_res$signif, digits = 4), permutations = mntl_res$permutations))
})))
saveRDS(seq_simil, file.path(i_path, "Mantel_test_temporal_sequences_unmarked.RDS"))
Print Mantel test results.
seq_simil <- readRDS(file.path(i_path, "Mantel_test_temporal_sequences_unmarked.RDS"))
seq_simil
## indiv num_calls mantel_r mantel_p permutations
## 1: UM2 23 0.09 0.0956 9999
## 2: UM3 5 0.57 0.0750 119
## 3: UM4 13 -0.15 0.8833 9999
## 4: UM5 7 -0.43 0.9722 5039
Adjust alpha of 0.05 using a Bonferroni correction.
# the adjusted alpha by a Bonferroni correction:
alpha <- 0.05
adj_alpha <- round(alpha/nrow(seq_simil), digits = 4)
adj_alpha
## [1] 0.0125