I am using the code below to fill a 3D array from another 3D array. I have used the sapply
function to apply the code lines at each individual (3rd dimension) as in Efficient way to fill a 3D array. Here is my code.
ind <- 1000 individuals <- as.character(seq(1, ind, by = 1)) maxCol <- 7 col <- 4 line <- 0 a <- 0 b <- 0 c <- 0 col_array <- c("year","time", "ID", "age", as.vector(outer(c(paste(seq(0, 1, by = 1), "year", sep="_"), paste(seq(2, maxCol, by = 1), "years", sep="_")), c("S_F", "I_F", "R_F"), paste, sep="_"))) array1 <- array(sample(1:100, length(col_array), replace = T), dim=c(2, length(col_array), ind), dimnames=list(NULL, col_array, individuals)) ## 3rd dimension = individual ID ## print(array1) col_array <- c("year","time", "ID", "age", as.vector(outer(c(paste(seq(0, 1, by = 1), "year", sep="_"), paste(seq(2, maxCol, by = 1), "years", sep="_")), c("S_M", "I_M", "R_M"), paste, sep="_"))) array2 <- array(NA, dim=c(2, length(col_array), ind), dimnames=list(NULL, col_array, individuals)) ## 3rd dimension = individual ID ## print(array2) tic("array2") array2 <- sapply(individuals, function(i){ ## Fill the first columns array2[line + 1, c("year", "time", "ID", "age"), i] <- c(a, b, i, c) ## Define column indexes for individuals S col_start_S_F <- which(colnames(array1[,,i])=="0_year_S_F") col_end_S_F <- which(colnames(array1[,,i])==paste(maxCol,"years_S_F", sep="_")) col_start_S_M <- which(colnames(array2[,,i])=="0_year_S_M") col_end_S_M <- which(colnames(array2[,,i])==paste(maxCol,"years_S_M", sep="_")) ## Fill the columns for individuals S p_S_M <- sapply(0:maxCol, function(x){pnorm(x, 4, 1)}) array2[line + 1, col_start_S_M:col_end_S_M, i] <- round(as.numeric(as.vector(array1[line + 1, col_start_S_F:col_end_S_F, i]))*p_S_M) ## Define column indexes for individuals I col_start_I_F <- which(colnames(array1[,,i])=="0_year_I_F") col_end_I_F <- which(colnames(array1[,,i])==paste(maxCol,"years_I_F", sep="_")) col_start_I_M <- which(colnames(array2[,,i])=="0_year_I_M") col_end_I_M <- which(colnames(array2[,,i])==paste(maxCol,"years_I_M", sep="_")) ## Fill the columns for individuals I p_I_M <- sapply(0:maxCol, function(x){pnorm(x, 2, 1)}) array2[line + 1, col_start_I_M:col_end_I_M, i] <- round(as.numeric(as.vector(array1[line + 1, col_start_I_F:col_end_I_F, i]))*p_I_M) ## Define column indexes for individuals R col_start_R_M <- which(colnames(array2[,,i])=="0_year_R_M") col_end_R_M <- which(colnames(array2[,,i])==paste(maxCol,"years_R_M", sep="_")) ## Fill the columns for individuals R array2[line + 1, col_start_R_M:col_end_R_M, i] <- as.numeric(as.vector(array2[line + 1, col_start_S_M:col_end_S_M, i])) + as.numeric(as.vector(array2[line + 1, col_start_I_M:col_end_I_M, i])) return(array2[,,i]) ## print(array2[,,i]) }, simplify = "array") ## print(array2) toc()
Is there a way to increase the performance/speed of my code (i.e., < 1 sec)? There are 500000 observations for the 3rd dimension. Any suggestions?
1 Answers
Answers 1
TL;DR: Here's a tidyverse solution that transforms the sample array into a dataframe and applies the requested changes. EDIT: I've added steps 1+2 to transform the original post's sample data into the format I used in step 3. The actual calculation in Step 3 is very fast (<0.1 sec), but the bottleneck is step 2, which takes 10 seconds for 500k rows.
Step 0: Create sample data for 500k individuals
ind <- 500000 individuals <- as.character(seq(1, ind, by = 1)) maxCol <- 7 col <- 4 line <- 0 a <- 0 b <- 0 c <- 0 col_array <- c("year","time", "ID", "age", as.vector(outer(c(paste(seq(0, 1, by = 1), "year", sep="_"), paste(seq(2, maxCol, by = 1), "years", sep="_")), c("S_F", "I_F", "R_F"), paste, sep="_"))) array1 <- array(sample(1:100, length(col_array), replace = T), dim=c(2, length(col_array), ind), dimnames=list(NULL, col_array, individuals)) ## 3rd dimension = individual ID dim(array1) # [1] 2 28 500000 # Two rows x 28 measures x 500k individuals
Step 1: Subset array and convert to data frame.
library(tidyverse) # OP only uses first line of array1. If other rows needed, replace with "array1 %>%" # and adjust renaming below to account for different Var1. array1_dt <- array1[1,,] %>% as.data.frame.table(stringsAsFactors = FALSE)
Step 2: Break out the stats into different columns, with one row for each individual-year. This is the slowest step (especially the spread
line), and takes 0.05 sec for 1000 individuals but 10 seconds for 500k. I expect a data.table solution could make it much faster, if needed.
array1_dt_reshape <- array1_dt %>% rename(stat = Var1, ID = Var2) %>% filter(!stat %in% c("year", "time", "ID", "age")) %>% mutate(year = stat %>% str_sub(end = 1), col = stat %>% str_sub(start = -3)) %>% select(-stat) %>% spread(col, Freq) %>% arrange(ID)
Step 3: Apply requested transformation. This function calculates the distribution with two sets of parameters, and uses these to scale the input table's columns. It takes 0.03 sec for 500k of individuals.
array_transform <- function(input_data = array1_dt_reshape, max_yr = 7, S_M_mean = 4, I_M_mean = 2) { tictoc::tic() # First calculate the distribution function values to apply to all individuals, # depending on year. p_S_M_vals <- sapply(0:max_yr, function(x){pnorm(x, S_M_mean, 1)}) p_I_M_vals <- sapply(0:max_yr, function(x){pnorm(x, I_M_mean, 1)}) # For each year, scale S_M + I_M by the respective distribution functions. # This solution relies on the fact that each ID has 8 rows every time, # so we can recycle the 8 values in the distribution functions. output <- input_data %>% # group_by(ID) %>% <-- Not needed mutate(S_M = S_F * p_S_M_vals, I_M = I_F * p_I_M_vals, R_M = S_M + I_M) # %>% ungroup <-- Not needed tictoc::toc() return(output) } array1_output <- array_transform(array1_dt_reshape)
Results
head(array1_output) ID year I_F R_F S_F S_M I_M R_M 1 1 0 16 76 23 7.284386e-04 0.3640021 0.3647305 2 1 1 46 96 80 1.079918e-01 7.2981417 7.4061335 3 1 2 27 57 76 1.729010e+00 13.5000000 15.2290100 4 1 3 42 64 96 1.523090e+01 35.3364793 50.5673837 5 1 4 74 44 57 2.850000e+01 72.3164902 100.8164902 6 1 5 89 90 64 5.384606e+01 88.8798591 142.7259228 7 1 6 23 16 44 4.299899e+01 22.9992716 65.9982658 8 1 7 80 46 90 8.987851e+01 79.9999771 169.8784862 9 2 0 16 76 23 7.284386e-04 0.3640021 0.3647305 10 2 1 46 96 80 1.079918e-01 7.2981417 7.406133
0 comments:
Post a Comment