Tuesday, September 13, 2016

How can I reference a list based on a variable within a data.frame?

Leave a Comment

I have a simple table with emp_id and job_code. I would like to return the correct payout based on the job_code

I've managed this with nested ifelse's but what if I have more job_code's?

library(dplyr) set.seed(1)  emp_id   <- round(rnorm(100, 500000, 10000)) job_code <- sample(c('a', 'b', 'c'), 100, replace = TRUE) result   <- sample(c(1,2,3,4), 100, replace = TRUE)  df <- data.frame(emp_id = emp_id, job_code = job_code, result = result)  job_a <- c(0, 500, 1000, 5000) job_b <- c(0, 200, 500, 750) job_c <- c(0, 250, 750, 1000)  # Works but sucky df %>% mutate(payout = ifelse(job_code == 'a', job_a[result],   ifelse(job_code == 'b', job_b[result],     job_c[result]))) 

and dput if you prefer:

structure(list(emp_id = c(493735, 501836, 491644, 515953, 503295,  491795, 504874, 507383, 505758, 496946, 515118, 503898, 493788,  477853, 511249, 499551, 499838, 509438, 508212, 505939, 509190,  507821, 500746, 480106, 506198, 499439, 498442, 485292, 495218,  504179, 513587, 498972, 503877, 499462, 486229, 495850, 496057,  499407, 511000, 507632, 498355, 497466, 506970, 505567, 493112,  492925, 503646, 507685, 498877, 508811, 503981, 493880, 503411,  488706, 514330, 519804, 496328, 489559, 505697, 498649, 524016,  499608, 506897, 500280, 492567, 501888, 481950, 514656, 501533,  521726, 504755, 492901, 506107, 490659, 487464, 502914, 495567,  500011, 500743, 494105, 494313, 498648, 511781, 484764, 505939,  503330, 510631, 496958, 503700, 502671, 494575, 512079, 511604,  507002, 515868, 505585, 487234, 494267, 487754, 495266), job_code = structure(c(1L,  1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 3L, 3L, 1L, 3L, 3L, 3L, 1L, 2L,  3L, 3L, 2L, 1L, 1L, 1L, 2L, 3L, 2L, 1L, 1L, 2L, 3L, 2L, 1L, 2L,  2L, 2L, 3L, 3L, 2L, 2L, 2L, 1L, 2L, 3L, 1L, 2L, 1L, 2L, 1L, 2L,  3L, 3L, 3L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 3L, 2L, 1L, 1L, 3L, 3L,  1L, 1L, 3L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 1L,  2L, 3L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 2L, 3L, 1L,  1L, 1L, 3L), .Label = c("a", "b", "c"), class = "factor"), result = c(3,  1, 2, 2, 2, 4, 1, 4, 1, 2, 1, 1, 4, 3, 2, 2, 1, 2, 4, 3, 3, 2,  2, 4, 4, 4, 4, 4, 2, 4, 4, 2, 2, 4, 1, 2, 2, 1, 3, 4, 4, 1, 3,  2, 3, 2, 2, 1, 2, 3, 2, 1, 4, 2, 4, 2, 4, 1, 4, 2, 1, 2, 4, 2,  3, 4, 1, 3, 3, 2, 2, 3, 4, 1, 1, 2, 2, 4, 1, 2, 2, 3, 3, 4, 1,  1, 4, 4, 1, 4, 1, 1, 4, 3, 1, 2, 3, 2, 2, 1)), .Names = c("emp_id",  "job_code", "result"), row.names = c(NA, -100L), class = "data.frame") 

What I'd like to do ideally is have the payouts within a data.frame but not sure how to reference it properly:

job_payouts <- data.frame(a = job_a, b = job_b, c = job_c) # Won't work... df %>% mutate(payout = job_payouts$job_code[result]) 

3 Answers

Answers 1

Using tools from tidyverse:

library(dplyr) library(stringr) library(tidyr)  # your data set.seed(1)  emp_id <- round(rnorm(100, 500000, 10000)) job_code <- sample(c('a', 'b', 'c'), 100, replace = TRUE) result <- sample(c(1,2,3,4), 100, replace = TRUE)  # construct a data frame df <-    data.frame(emp_id = emp_id,              job_code = job_code,               result = result,              stringsAsFactors = FALSE)  # your jobs job_a <- c(0, 500, 1000, 5000) job_b <- c(0, 200, 500, 750) job_c <- c(0, 250, 750, 1000)  # construct a data frame my_job <-    data.frame(job_a, job_b, job_c) %>%    gather(job, value) %>%    group_by(job) %>%    mutate(result = 1:n(),          job_code = str_replace(job, "job_", "")) %>%    ungroup %>%    select(-job)  # join df and my_job into my_results table my_results <-   left_join(df, my_job) 

Results:

my_results %>% tbl_df  Source: local data frame [100 x 4]     emp_id job_code result value     (dbl)    (chr)  (dbl) (dbl) 1  493735        a      3  1000 2  501836        a      1     0 3  491644        b      2   200 4  515953        a      2   500 5  503295        a      2   500 6  491795        b      4   750 7  504874        b      1     0 8  507383        a      4  5000 9  505758        a      1     0 10 496946        c      2   250 ..    ...      ...    ...   ... 

Answers 2

This can be achieved through the super cool method of matrix indexing in base R which is extremely fast and efficient.

# build jobs payout lookup matrix, by hand (see edit below for an extension) jobs <- rbind(job_a, job_b, job_c)  # add row names to the matrix for convenient reference rownames(jobs) <- levels(df$job_code)  # get payout using matrix indexing df$payout <- jobs[cbind(df$job_code, df$result)] 

This returns

# print out first 6 observations head(df)   emp_id job_code result payout 1 493735        a      3   1000 2 501836        a      1      0 3 491644        b      2    200 4 515953        a      2    500 5 503295        a      2    500 6 491795        b      4    750  # print out jobs matrix for comparison jobs   [,1] [,2] [,3] [,4] a    0  500 1000 5000 b    0  200  500  750 c    0  250  750 1000 

There are a couple of details that worth mentioning.

  1. The data.frame function converts the job_code character vector, so that df$job_code is a factor variable where labels are associated with the natural numbers 1, 2, 3, ... By default, levels of the factor are ordered alphabetically by label so, in this example, the label "a" corresponds to 1, "b" to 2, and "c" to 3. You can use the levels function to find the order of the factor variable and construct the jobs matrix following that template.
  2. The jobs matrix is used as a lookup table. It is constructed so that these integers refer to row numbers of the jobs matrix. Then, the columns can be subset as you do with the original payout vectors.
  3. cbind(df$job_code, df$result) forms a 2 by nrow(df) (100) matrix which is used to look up the nrow(df) payoff values for each employee from the jobs matrix using matrix indexing. The R intro manual has a nice intro section on matrix indexing and additional details can be found in help("[").

Edit: Automating the construction of the lookup matrix

In the comments to this answer, the OP expresses concern that building the lookup matrix (which I called "jobs"), by hand would be tedious and prone to error. To address these valid concerns, we can use a somewhat obscure argument to the mget function, "ifnotfound." This argument allows us to control the output of elements of the list that mget returns when they are present in the vector of names, but not present in the environment.

In the comments, I suggested using NA to fill in missing levels in the comment below. We can extend this by using NA as the input for "ifnotfound."

Suppose df$job_code is a factor that has levels "a", "aa", "b", and "c" in that order. Then we build the look up matrix as follows:

# build vector for example, the actual code, using levels(), follows as a comment job_codes <- c("a", "aa", "b", "c") # job_codes <- levels(df$jobcodes)  # get ordered list of payouts, with NA for missing payouts payoutList <- mget(paste0("job_", job_codes), ifnotfound=NA) 

which returns a named list.

payoutList $job_a [1]    0  500 1000 5000  $job_aa [1] NA  $job_b [1]   0 200 500 750  $job_c [1]    0  250  750 1000 

Note that payoutList$job_aa is a single NA. Now, build the matrix from this list.

# build lookup matrix using do.call() and rbind() jobs.lookupMat <- do.call(rbind, payoutList)  jobs.lookupMat        [,1] [,2] [,3] [,4] job_a     0  500 1000 5000 job_aa   NA   NA   NA   NA job_b     0  200  500  750 job_c     0  250  750 1000 

The rows of the matrix are properly ordered according to the levels of the factor df$job_code, conveniently named, and NAs fill in rows wherever there is no payout.

Answers 3

Without changing your data structure, you can do this by defining a function:

job_search <- function(code){   var_name <- paste0("job_",code)   if (exists(var_name)){     return(get(var_name))   }else{     return(NA)   } }  library(data.table) setDT(df) df[, payout := job_search(job_code)[result], by = .(emp_id)] df         emp_id job_code result payout   1: 493735        a      3   1000   2: 501836        a      1      0   3: 491644        b      2    200   4: 515953        a      2    500   5: 503295        a      2    500   6: 491795        b      4    750   7: 504874        b      1      0   8: 507383        a      4   5000   9: 505758        a      1      0  10: 496946        c      2    250  11: 515118        c      1      0  12: 503898        a      1      0  ... 

However, this is a fairly unstable way to keep your data, and the paste + get syntax is convoluted.

A better way to store your data would be in a lookup table:

library(data.table)  job_a <- data.frame(payout = c(0, 500, 1000, 5000)) job_b <- data.frame(payout = c(0, 200, 500, 750)) job_c <- data.frame(payout = c(0, 250, 750, 1000))  job_lookup <- rbindlist( #this is a data.table   l = list(a = job_a,b = job_b,c = job_c),   idcol = TRUE )  # create your result index job_lookup[, result := 1:.N, by = .id] job_lookup     .id payout result  1:   a      0     1  2:   a    500     2  3:   a   1000     3  4:   a   5000     4  5:   b      0     1  6:   b    200     2  7:   b    500     3  8:   b    750     4  9:   c      0     1 10:   c    250     2 11:   c    750     3 12:   c   1000     4  # merge to your initial data.frame merge(df, job_lookup, by.x = c("job_code","result"), by.y = c(".id","result"), all.x = TRUE)      job_code result emp_id payout 1          a      1 505758      0 2          a      1 501836      0 3          a      1 503898      0 4          a      1 494575      0 5          a      1 487464      0 6          a      1 503700      0 7          a      1 505939      0 8          a      1 503330      0 9          a      1 512079      0 10         a      1 481950      0 11         a      1 507685      0 12         a      1 490659      0 ...         
If You Enjoyed This, Take 5 Seconds To Share It

0 comments:

Post a Comment