-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexample_stratified_sampling_chunks_BirdNET.Rmd
160 lines (120 loc) · 6.25 KB
/
example_stratified_sampling_chunks_BirdNET.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
---
title: "Demo stratified sampling clips based on BirdNET output"
output:
rmarkdown::github_document
---
Example code for reading BirdNET output and exporting a stratified sample of detections in species subfolders. Here stratification is done by site, week and species. The method could be extended to include sampling by confidence score, month, time of day or any other categories as required.
```{r}
# One-off install of the package from GitHub, if needed:
# devtools::install_github('BritishTrustForOrnithology/AcousticTools')
library(AcousticTools)

# Gather the BirdNET results stored under the recordings folder into df.
df <- AcousticTools::read_birdnet_results(folder = "H:/Leiothrix Recordings")

# The full data set is too large for a simple demo, so keep only detections
# whose recording path matches one of the chosen site codes AND whose BirdNET
# English name is one of three example species.
df <- subset(
  df,
  grepl("ST403|ST7747|ST7847", original_wav) &
    birdnet_english_name %in%
      c("Barn Owl", "Black-crowned Night-Heron", "Brown Creeper")
)

# Quick look at what is left.
head(df)
```
Currently we have all detections. In this example I'm going to do some stratified sampling by site, week and species. First, split the file names into their component parts using stringr::str_split_fixed. This assumes files are named in the recognised YYYYMMDD-HHMMSS-Location-Recordist-Microphone.wav format.
```{r}
library(stringr)

# Cut each recording's file name (directory removed) at every hyphen or dot.
# With n = Inf every split position is used, giving one column per piece.
bits <- as.data.frame(
  stringr::str_split_fixed(
    string = basename(df$original_wav),
    pattern = "-|\\.",
    n = Inf
  )
)

# Label the pieces in file-name order.
# NOTE(review): assumes every file name yields exactly six pieces
# (date, time, location, recordist, microphone, extension) - confirm.
names(bits) <- c("date_str", "time_str", "loc", "rec", "mic", "ext")
head(bits)

# Attach the pieces as extra columns of the detections table.
df <- cbind(df, bits)
```
As before, I recommend extracting the date and time as a datetime variable. This allows us to calculate the detection times, and also means we can use the date objects for temporal sampling, e.g. here making a week number variable for later stratification.
First convert the date and time strings from the filename into the start datetime of the recording. Then the detection offset (start) can be added to get the start datetime of the detection, and lubridate can be used to derive the week number.
```{r}
library(lubridate)

# Start time of each original recording, parsed from the date and time pieces
# of its file name. No tz is given, so the session's time zone is used.
df$recording_start_dt <- as.POSIXct(
  with(df, paste(date_str, time_str)),
  format = "%Y%m%d %H%M%S"
)

# Detection datetime = recording start + the detection's offset.
# Adding a number to a POSIXct value adds that many seconds.
df$detection_dt <- df$recording_start_dt + df$start

# Week-of-year number of each detection, used later as a stratum.
df$weeknum <- lubridate::week(df$detection_dt)

# Check the derived columns against their inputs.
head(
  df[, c(
    "original_wav",
    "start",
    "recording_start_dt",
    "detection_dt",
    "weeknum"
  )]
)
```
Now do the stratified sampling. This can be done in nested loops but this gets messy the more strata there are. So I've tried to make this more easily scalable by creating a data frame (strata) that contains the unique combinations of location, week and species that we have in the data.
```{r}
# Maximum number of chunks to keep from each stratum.
nchunks <- 5

# The columns whose combined values define a stratum.
strata_vars <- c("loc", "weeknum", "birdnet_english_name")

# Positions of those columns within df (in df's own column order).
strata_col_indx <- which(is.element(names(df), strata_vars))

# One row per distinct combination of the strata columns that occurs in the
# data - essentially the set of output folders that will be needed.
strata <- unique(df[, strata_col_indx])
head(strata)
```
Now we can use this to do the sampling for each stratum. In this example there are two methods for sampling: 'random' sampling or 'tophits', where the latter just gives the N highest scoring detections. Ultimately, I can pass the output locations to the extract_chunk function which will automatically make any required folders.
```{r}
# How to choose chunks when a stratum holds more than nchunks detections:
#   "tophits" keeps the nchunks highest-scoring detections
#   "random"  keeps a simple random sample of nchunks detections
method <- "tophits"
#method <- "random"

# Stop straight away on an unknown method, rather than failing later inside
# the sampling step with an obscure "object not found" error.
if (!method %in% c("tophits", "random")) {
  stop("method must be 'tophits' or 'random', not '", method, "'",
       call. = FALSE)
}

# Sample each stratum in turn; the result is a list with one data frame per
# row of strata. seq_len() yields no iterations when strata has no rows,
# whereas 1:nrow(strata) would count 1, 0 and fail.
strata_out <- lapply(seq_len(nrow(strata)), function(i) {
  # Flag the detections that match this stratum in every strata column. The
  # columns are taken from strata itself, so this needs no editing when the
  # stratification variables change.
  in_stratum <- rep(TRUE, nrow(df))
  for (v in names(strata)) {
    in_stratum <- in_stratum & df[[v]] == strata[[v]][i]
  }
  # which() discards rows whose comparison is NA, as subset() would.
  temp <- df[which(in_stratum), , drop = FALSE]

  # nchunks or fewer detections: keep them all.
  if (nrow(temp) <= nchunks) {
    return(temp)
  }

  # More than nchunks detections: pick nchunks of them at random ...
  if (method == "random") {
    return(temp[sample(nrow(temp), nchunks), , drop = FALSE])
  }

  # ... or take the nchunks with the highest score.
  temp <- temp[order(-temp$score), , drop = FALSE]
  temp[seq_len(nchunks), , drop = FALSE]
})

# Unpack all stratum outputs to a flat df. With no strata there is nothing to
# bind (rbind of an empty list gives NULL), so return df's columns with zero
# rows instead.
df_sampled <- if (length(strata_out) > 0) {
  do.call(rbind, strata_out)
} else {
  df[0, , drop = FALSE]
}
```
Now we can use the detection datetime to make a unique filename, and we can use the location, species name and week number to make folders.
```{r}
# Build the clip's file name: detection datetime, location, recordist and
# microphone joined by hyphens, then a "__" placeholder standing in for the
# as-yet-unknown species, then the original file extension.
df_sampled$newfilename <- paste0(
  paste(
    format(df_sampled$detection_dt, "%Y%m%d-%H%M%S"),
    df_sampled$loc,
    df_sampled$rec,
    df_sampled$mic,
    "__",
    sep = "-"
  ),
  ".",
  df_sampled$ext
)

# Root folder that the chunks are saved under.
path_export <- "C:/exports"

# Full path and file name of each clip to be exported.
# This line will need to be modified for different strata, so that there is
# one folder level per stratum variable. Here the nesting is
# site > species > week, but the levels could be put in a different order.
df_sampled$chunk_fullname <- with(
  df_sampled,
  file.path(path_export, loc, birdnet_english_name, weeknum, newfilename)
)
head(df_sampled$chunk_fullname)
```
Now we can do the exporting using AcousticTools::extract_chunk. Because the file_chunk path we pass in contains the site/species/weeknum folders, all the chunks are created in their respective folders. extract_chunk automatically makes any folders as needed.
```{r}
# Export one audio chunk per sampled detection.
# seq_len() makes the loop a no-op when df_sampled has no rows; 1:nrow() would
# count 1, 0 and call extract_chunk() with missing values.
for (i in seq_len(nrow(df_sampled))) {
  AcousticTools::extract_chunk(
    file_wav = df_sampled$original_wav[i],     # source recording
    file_chunk = df_sampled$chunk_fullname[i], # destination path of the clip
    start = df_sampled$start[i],               # detection start offset
    end = df_sampled$end[i],                   # detection end offset
    chunk_duration = 5,
    verbose = TRUE
  )
}
```