I have a simple table with emp_id
and job_code
. I would like to return the correct payout
based on the job_code
I've managed this with nested ifelse's but what if I have more job_code
's?
library(dplyr) set.seed(1) emp_id <- round(rnorm(100, 500000, 10000)) job_code <- sample(c('a', 'b', 'c'), 100, replace = TRUE) result <- sample(c(1,2,3,4), 100, replace = TRUE) df <- data.frame(emp_id = emp_id, job_code = job_code, result = result) job_a <- c(0, 500, 1000, 5000) job_b <- c(0, 200, 500, 750) job_c <- c(0, 250, 750, 1000) # Works but sucky df %>% mutate(payout = ifelse(job_code == 'a', job_a[result], ifelse(job_code == 'b', job_b[result], job_c[result])))
and dput
if you prefer:
structure(list(emp_id = c(493735, 501836, 491644, 515953, 503295, 491795, 504874, 507383, 505758, 496946, 515118, 503898, 493788, 477853, 511249, 499551, 499838, 509438, 508212, 505939, 509190, 507821, 500746, 480106, 506198, 499439, 498442, 485292, 495218, 504179, 513587, 498972, 503877, 499462, 486229, 495850, 496057, 499407, 511000, 507632, 498355, 497466, 506970, 505567, 493112, 492925, 503646, 507685, 498877, 508811, 503981, 493880, 503411, 488706, 514330, 519804, 496328, 489559, 505697, 498649, 524016, 499608, 506897, 500280, 492567, 501888, 481950, 514656, 501533, 521726, 504755, 492901, 506107, 490659, 487464, 502914, 495567, 500011, 500743, 494105, 494313, 498648, 511781, 484764, 505939, 503330, 510631, 496958, 503700, 502671, 494575, 512079, 511604, 507002, 515868, 505585, 487234, 494267, 487754, 495266), job_code = structure(c(1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 3L, 3L, 1L, 3L, 3L, 3L, 1L, 2L, 3L, 3L, 2L, 1L, 1L, 1L, 2L, 3L, 2L, 1L, 1L, 2L, 3L, 2L, 1L, 2L, 2L, 2L, 3L, 3L, 2L, 2L, 2L, 1L, 2L, 3L, 1L, 2L, 1L, 2L, 1L, 2L, 3L, 3L, 3L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 3L, 2L, 1L, 1L, 3L, 3L, 1L, 1L, 3L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 1L, 2L, 3L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 2L, 3L, 1L, 1L, 1L, 3L), .Label = c("a", "b", "c"), class = "factor"), result = c(3, 1, 2, 2, 2, 4, 1, 4, 1, 2, 1, 1, 4, 3, 2, 2, 1, 2, 4, 3, 3, 2, 2, 4, 4, 4, 4, 4, 2, 4, 4, 2, 2, 4, 1, 2, 2, 1, 3, 4, 4, 1, 3, 2, 3, 2, 2, 1, 2, 3, 2, 1, 4, 2, 4, 2, 4, 1, 4, 2, 1, 2, 4, 2, 3, 4, 1, 3, 3, 2, 2, 3, 4, 1, 1, 2, 2, 4, 1, 2, 2, 3, 3, 4, 1, 1, 4, 4, 1, 4, 1, 1, 4, 3, 1, 2, 3, 2, 2, 1)), .Names = c("emp_id", "job_code", "result"), row.names = c(NA, -100L), class = "data.frame")
What I'd like to do ideally is have the payouts within a data.frame but not sure how to reference it properly:
job_payouts <- data.frame(a = job_a, b = job_b, c = job_c) # Won't work... df %>% mutate(payout = job_payouts$job_code[result])
3 Answers
Answers 1
Using tools from tidyverse:
library(dplyr) library(stringr) library(tidyr) # your data set.seed(1) emp_id <- round(rnorm(100, 500000, 10000)) job_code <- sample(c('a', 'b', 'c'), 100, replace = TRUE) result <- sample(c(1,2,3,4), 100, replace = TRUE) # construct a data frame df <- data.frame(emp_id = emp_id, job_code = job_code, result = result, stringsAsFactors = FALSE) # your jobs job_a <- c(0, 500, 1000, 5000) job_b <- c(0, 200, 500, 750) job_c <- c(0, 250, 750, 1000) # construct a data frame my_job <- data.frame(job_a, job_b, job_c) %>% gather(job, value) %>% group_by(job) %>% mutate(result = 1:n(), job_code = str_replace(job, "job_", "")) %>% ungroup %>% select(-job) # join df and my_job into my_results table my_results <- left_join(df, my_job)
Results:
my_results %>% tbl_df Source: local data frame [100 x 4] emp_id job_code result value (dbl) (chr) (dbl) (dbl) 1 493735 a 3 1000 2 501836 a 1 0 3 491644 b 2 200 4 515953 a 2 500 5 503295 a 2 500 6 491795 b 4 750 7 504874 b 1 0 8 507383 a 4 5000 9 505758 a 1 0 10 496946 c 2 250 .. ... ... ... ...
Answers 2
This can be achieved through the super cool method of matrix indexing in base R which is extremely fast and efficient.
# build jobs payout lookup matrix, by hand (see edit below for an extension) jobs <- rbind(job_a, job_b, job_c) # add row names to the matrix for convenient reference rownames(jobs) <- levels(df$job_code) # get payout using matrix indexing df$payout <- jobs[cbind(df$job_code, df$result)]
This returns
# print out first 6 observations head(df) emp_id job_code result payout 1 493735 a 3 1000 2 501836 a 1 0 3 491644 b 2 200 4 515953 a 2 500 5 503295 a 2 500 6 491795 b 4 750 # print out jobs matrix for comparison jobs [,1] [,2] [,3] [,4] a 0 500 1000 5000 b 0 200 500 750 c 0 250 750 1000
There are a couple of details that worth mentioning.
- The
data.frame
function converts the job_code character vector, so thatdf$job_code
is a factor variable where labels are associated with the natural numbers 1, 2, 3, ... By default, levels of the factor are ordered alphabetically by label so, in this example, the label "a" corresponds to 1, "b" to 2, and "c" to 3. You can use thelevels
function to find the order of the factor variable and construct the jobs matrix following that template. - The jobs matrix is used as a lookup table. It is constructed so that these integers refer to row numbers of the jobs matrix. Then, the columns can be subset as you do with the original payout vectors.
cbind(df$job_code, df$result)
forms a 2 bynrow(df)
(100) matrix which is used to look up thenrow(df)
payoff values for each employee from the jobs matrix using matrix indexing. The R intro manual has a nice intro section on matrix indexing and additional details can be found inhelp("[")
.
Edit: Automating the construction of the lookup matrix
In the comments to this answer, the OP expresses concern that building the lookup matrix (which I called "jobs"), by hand would be tedious and prone to error. To address these valid concerns, we can use a somewhat obscure argument to the mget
function, "ifnotfound." This argument allows us to control the output of elements of the list that mget
returns when they are present in the vector of names, but not present in the environment.
In the comments, I suggested using NA
to fill in missing levels in the comment below. We can extend this by using NA
as the input for "ifnotfound."
Suppose df$job_code
is a factor that has levels "a", "aa", "b", and "c" in that order. Then we build the look up matrix as follows:
# build vector for example, the actual code, using levels(), follows as a comment job_codes <- c("a", "aa", "b", "c") # job_codes <- levels(df$jobcodes) # get ordered list of payouts, with NA for missing payouts payoutList <- mget(paste0("job_", job_codes), ifnotfound=NA)
which returns a named list.
payoutList $job_a [1] 0 500 1000 5000 $job_aa [1] NA $job_b [1] 0 200 500 750 $job_c [1] 0 250 750 1000
Note that payoutList$job_aa
is a single NA. Now, build the matrix from this list.
# build lookup matrix using do.call() and rbind() jobs.lookupMat <- do.call(rbind, payoutList) jobs.lookupMat [,1] [,2] [,3] [,4] job_a 0 500 1000 5000 job_aa NA NA NA NA job_b 0 200 500 750 job_c 0 250 750 1000
The rows of the matrix are properly ordered according to the levels of the factor df$job_code
, conveniently named, and NA
s fill in rows wherever there is no payout.
Answers 3
Without changing your data structure, you can do this by defining a function:
job_search <- function(code){ var_name <- paste0("job_",code) if (exists(var_name)){ return(get(var_name)) }else{ return(NA) } } library(data.table) setDT(df) df[, payout := job_search(job_code)[result], by = .(emp_id)] df emp_id job_code result payout 1: 493735 a 3 1000 2: 501836 a 1 0 3: 491644 b 2 200 4: 515953 a 2 500 5: 503295 a 2 500 6: 491795 b 4 750 7: 504874 b 1 0 8: 507383 a 4 5000 9: 505758 a 1 0 10: 496946 c 2 250 11: 515118 c 1 0 12: 503898 a 1 0 ...
However, this is a fairly unstable way to keep your data, and the paste + get syntax is convoluted.
A better way to store your data would be in a lookup table:
library(data.table) job_a <- data.frame(payout = c(0, 500, 1000, 5000)) job_b <- data.frame(payout = c(0, 200, 500, 750)) job_c <- data.frame(payout = c(0, 250, 750, 1000)) job_lookup <- rbindlist( #this is a data.table l = list(a = job_a,b = job_b,c = job_c), idcol = TRUE ) # create your result index job_lookup[, result := 1:.N, by = .id] job_lookup .id payout result 1: a 0 1 2: a 500 2 3: a 1000 3 4: a 5000 4 5: b 0 1 6: b 200 2 7: b 500 3 8: b 750 4 9: c 0 1 10: c 250 2 11: c 750 3 12: c 1000 4 # merge to your initial data.frame merge(df, job_lookup, by.x = c("job_code","result"), by.y = c(".id","result"), all.x = TRUE) job_code result emp_id payout 1 a 1 505758 0 2 a 1 501836 0 3 a 1 503898 0 4 a 1 494575 0 5 a 1 487464 0 6 a 1 503700 0 7 a 1 505939 0 8 a 1 503330 0 9 a 1 512079 0 10 a 1 481950 0 11 a 1 507685 0 12 a 1 490659 0 ...
0 comments:
Post a Comment