###########################################################################
# Non-Linear Decomposition Technique for Logit or Probit Model            #
# Randomized Ordering of Variables                                        #
# Originally developed in: Fairlie, Robert. 1999. "The Absence of the     #
# African-American Owned Business: An Analysis of the Dynamics of Self-   #
# Employment," Journal of Labor Economics, 17(1): 80-108                  #
# Latest Revision: Fairlie, Robert. 2017. "Addressing Path Dependence and #
# Incorporating Sample Weights in the Nonlinear Blinder-Oaxaca            #
# Decomposition Technique for Logit, Probit and Other Nonlinear Models"   #
# Stanford Institute for Economic Policy Research, Working Paper 17-013   #
# Translated to R by Beatriz Rache and RF in May 2025                     #
###########################################################################

# NOTES:
# Currently set to randomize ordering of groups of variables, see NOTE:ORDERING to change
# Currently set for use with sample weights, see NOTE:WEIGHTS to change
# Currently set for pooled (all races) coefficient estimates, see NOTE:POOLED to change
# Currently set for 10 iterations, see NOTE:ITERATIONS to change
# Currently set for Logit model, see NOTE:PROBIT to change
# Example data set can be downloaded at http://people.ucsc.edu/~rfairlie/decomposition/

# Clear workspace and set directory
rm(list = ls())
# NEEDS TO BE CHANGED
setwd('C:/Users/rfairlie/temp/decomp/r2024')
t0 <- Sys.time()

# Set seed
set.seed(2024)

# Load packages
library(dplyr)
library(readxl)
library(xlsx)
library(stats)
library(stringr)

# Specify number of iterations
# NOTE:ITERATIONS change this to 1000 or higher for the final run, but test with fewer
numiterations <- 10

# NOTE:POOLED define race variables to be included only in pooled logits
r <- 4
racevars <- c("black", "latino", "natamer", "asian")

# Specify total number of independent variables, not including race dummies above
k <- 31

# Define categories of variables for decomposition
group1 <- c("female", "age")
group2 <- c("married", "prevmar", "children", "chld617")
group3 <- c("hsgrad", "somecol", "college", "gradsch")
group4 <- c("inc1015", "inc1520", "inc2025", "inc2530", "inc3035",
            "inc3540", "inc4050", "inc5060", "inc6075", "incgt75")
group5 <- c("midatlan", "encent", "wncent", "satlan", "escent",
            "wscent", "mountain", "pacific")
group6 <- c("notcc", "notmsa", "notid")

# Define short labels for each group of variables for final table output
labelgroup <- c("Gender/Age", "Family", "Education", "Income", "Region", "City")

# Rename minority group variables
mgroup1 <- c("mfemale", "mage")
mgroup2 <- c("mmarried", "mprevmar", "mchildren", "mchld617")
mgroup3 <- c("mhsgrad", "msomecol", "mcollege", "mgradsch")
mgroup4 <- c("minc1015", "minc1520", "minc2025", "minc2530", "minc3035",
             "minc3540", "minc4050", "minc5060", "minc6075", "mincgt75")
mgroup5 <- c("mmidatlan", "mencent", "mwncent", "msatlan", "mescent",
             "mwscent", "mmountain", "mpacific")
mgroup6 <- c("mnotcc", "mnotmsa", "mnotid")

# Rename reference group (white) variables
wgroup1 <- c("wfemale", "wage")
wgroup2 <- c("wmarried", "wprevmar", "wchildren", "wchld617")
wgroup3 <- c("whsgrad", "wsomecol", "wcollege", "wgradsch")
wgroup4 <- c("winc1015", "winc1520", "winc2025", "winc2530", "winc3035",
             "winc3540", "winc4050", "winc5060", "winc6075", "wincgt75")
wgroup5 <- c("wmidatlan", "wencent", "wwncent", "wsatlan", "wescent",
             "wwscent", "wmountain", "wpacific")
wgroup6 <- c("wnotcc", "wnotmsa", "wnotid")

# Combine groups
vars  <- c(group1, group2, group3, group4, group5, group6)
mvars <- c(mgroup1, mgroup2, mgroup3, mgroup4, mgroup5, mgroup6)
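# Optional sanity check (not in the original script): confirm that the
# declared variable count k matches the group definitions above
stopifnot(length(vars) == k, length(mvars) == k)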
wvars <- c(wgroup1, wgroup2, wgroup3, wgroup4, wgroup5, wgroup6)

# Prepare original data for program
# NEEDS TO BE CHANGED
temp <- read.csv("C:/Users/rfairlie/temp/decomp/finaldecomp00.csv") %>%
  # Define dependent variable
  mutate(y = homecomp) %>%
  # Delete observations with missing values for the dependent variable or
  # key independent variables (extend the list as needed)
  filter(!is.na(y) & !is.na(hsgrad) & !is.na(inc1015)) %>%
  # NOTE:WEIGHTS, if no sample weights in data then set wgt to 1
  # mutate(wgt = 1) %>%
  # Delete observations with missing, zero or negative sample weights
  filter(!is.na(wgt) & wgt > 0) %>%
  # Define sample - e.g. only keep working-age adults for this run
  filter(age >= 25 & age <= 55) %>%
  # Normalize weights to mean one
  mutate(wgt = wgt / mean(wgt))

# Define subset of data for estimating coefficients
# NOTE:POOLED, currently set to use pooled sample (all racial groups) to estimate coefficients
# Could be changed to use the sample for only one group (e.g. white or black coefficients)
regdata <- temp %>%
  mutate(y = as.integer(y))
  # filter(white == 1)
  # filter(black == 1)

# Create minority sample with minority variable names
# Define which minority group is used
minority <- temp %>%
  filter(black == 1) %>%
  select(ym = y, wgtm = wgt, all_of(vars))

# Create full reference group (white) sample with reference group (white) variable names
white <- temp %>%
  filter(white == 1) %>%
  select(yw = y, wgtw = wgt, all_of(vars))

# Calculate weighted full-sample means (print these to inspect the data)
minority_means <- minority %>%
  summarise(across(everything(), ~ weighted.mean(.x, wgtm, na.rm = TRUE)))
white_means <- white %>%
  summarise(across(everything(), ~ weighted.mean(.x, wgtw, na.rm = TRUE)))

# Calculate means of dependent variables for the full samples
# These values are used to calculate the total gap in the decomposition
ymdata <- minority %>%
  summarise(ymfull = weighted.mean(ym, wgtm, na.rm = TRUE), mn = n())
ywdata <- white %>%
  summarise(ywfull = weighted.mean(yw, wgtw, na.rm = TRUE), wn = n())

# Define global variables for full white and full minority sample sizes
mobs <- ymdata$mn
wobs <- ywdata$wn

# Estimate logit model to obtain coefficients
# NOTE:POOLED set for pooled or specific group sample above
formula <- paste0('y ~ ', paste(c(racevars, vars), collapse = '+'))
logit_model <- suppressWarnings(glm(as.formula(formula),
                                    family = binomial(link = "logit"),
                                    data = regdata, weights = wgt))
orgcoefs <- coef(logit_model)
cov_matrix <- vcov(logit_model)

# Remove race dummies from coefficient dataset
# NOTE:POOLED only need this for pooled estimates
coefs <- orgcoefs[!names(orgcoefs) %in% racevars]

# Calculate predicted probabilities for both samples
# NOTE:PROBIT, need to change functional form here and below
white <- white %>%
  mutate(xbeta = as.numeric(cbind(1, as.matrix(select(., all_of(vars)))) %*% as.numeric(coefs)),
         wordprob = exp(xbeta) / (1 + exp(xbeta)))
# NOTE:PROBIT use normal distribution for probit
minority <- minority %>%
  mutate(xbeta = as.numeric(cbind(1, as.matrix(select(., all_of(vars)))) %*% as.numeric(coefs)),
         mordprob = exp(xbeta) / (1 + exp(xbeta)))
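# NOTE:PROBIT (sketch, not in the original script): for a probit model the
# predicted probabilities above would use the normal CDF in place of the
# logistic CDF, e.g.
#   wordprob = pnorm(xbeta)
#   mordprob = pnorm(xbeta)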
# Create empty starting datasets for iterations
final <- data.frame()
means2 <- data.frame()

# Define function for iterations (a macro in the original SAS/Stata versions)
simulate <- function() {
  for (it in 1:numiterations) {

    # Draw a weighted random sample of the reference group equal in size to
    # the minority sample, then put it in random order
    rename_vector_w <- setNames(wvars, vars)
    white1 <- white %>%
      sample_n(mobs, replace = TRUE, weight = wgtw)
    white1 <- white1 %>%
      mutate(random1 = runif(n()))
    white2 <- white1 %>%
      arrange(random1) %>%
      select(-c(xbeta)) %>%
      rename_at(vars(names(rename_vector_w)), ~rename_vector_w)

    # Randomly reorder the minority sample
    rename_vector_m <- setNames(mvars, vars)
    minority1 <- minority %>%
      sample_n(mobs, replace = FALSE, weight = wgtm) %>%
      select(-c(xbeta)) %>%
      rename_at(vars(names(rename_vector_m)), ~rename_vector_m)

    # Merge datasets together for matching, random matching
    combined <- white2 %>%
      cbind(minority1)

    # NOTE:ORDERING
    # Randomly assign variable groups ("defined groups") to "number groups"
    # for random ordering of variable groups
    rand <- tibble(i = 1:6, r = runif(6)) %>%
      mutate(ordgroup = ntile(r, 6))  # for random order
      # mutate(ordgroup = i)          # change to original order
      # mutate(ordgroup = 7 - i)      # change to reverse order

    # Define new variable groups based on the random ordering
    new_group1 <- get(paste0('group', rand$i[rand$ordgroup == 1]))
    new_group2 <- get(paste0('group', rand$i[rand$ordgroup == 2]))
    new_group3 <- get(paste0('group', rand$i[rand$ordgroup == 3]))
    new_group4 <- get(paste0('group', rand$i[rand$ordgroup == 4]))
    new_group5 <- get(paste0('group', rand$i[rand$ordgroup == 5]))
    new_group6 <- get(paste0('group', rand$i[rand$ordgroup == 6]))
    w_new_group1 <- paste0('w', new_group1)
    w_new_group2 <- paste0('w', new_group2)
    w_new_group3 <- paste0('w', new_group3)
    w_new_group4 <- paste0('w', new_group4)
    w_new_group5 <- paste0('w', new_group5)
    w_new_group6 <- paste0('w', new_group6)
    m_new_group1 <- paste0('m', new_group1)
    m_new_group2 <- paste0('m', new_group2)
    m_new_group3 <- paste0('m', new_group3)
    m_new_group4 <- paste0('m', new_group4)
    m_new_group5 <- paste0('m', new_group5)
    m_new_group6 <- paste0('m', new_group6)

    # Assign new variable groups in the new order
    newvars <- c(new_group1, new_group2, new_group3,
                 new_group4, new_group5, new_group6)

    # Create a named vector for new coefficient assignment
    newcoefsa <- setNames(numeric(k + 1), c("(Intercept)", newvars))
    # Fill in with coefficient estimates
    newcoefsa[names(newcoefsa) %in% names(coefs)] <- coefs[names(newcoefsa)]

    # Define distribution switches: x0a uses all white characteristics, and
    # each subsequent set switches one more group to minority characteristics
    x0a <- combined %>%
      select(all_of(w_new_group1), all_of(w_new_group2), all_of(w_new_group3),
             all_of(w_new_group4), all_of(w_new_group5), all_of(w_new_group6))
    x1a <- combined %>%
      select(all_of(m_new_group1), all_of(w_new_group2), all_of(w_new_group3),
             all_of(w_new_group4), all_of(w_new_group5), all_of(w_new_group6))
    x2a <- combined %>%
      select(all_of(m_new_group1), all_of(m_new_group2), all_of(w_new_group3),
             all_of(w_new_group4), all_of(w_new_group5), all_of(w_new_group6))
    x3a <- combined %>%
      select(all_of(m_new_group1), all_of(m_new_group2), all_of(m_new_group3),
             all_of(w_new_group4), all_of(w_new_group5), all_of(w_new_group6))
    x4a <- combined %>%
      select(all_of(m_new_group1), all_of(m_new_group2), all_of(m_new_group3),
             all_of(m_new_group4), all_of(w_new_group5), all_of(w_new_group6))
    x5a <- combined %>%
      select(all_of(m_new_group1), all_of(m_new_group2), all_of(m_new_group3),
             all_of(m_new_group4), all_of(m_new_group5), all_of(w_new_group6))
    x6a <- combined %>%
      select(all_of(m_new_group1), all_of(m_new_group2), all_of(m_new_group3),
             all_of(m_new_group4), all_of(m_new_group5), all_of(m_new_group6))

    # Perform white to black variable distribution switches
    combined <- combined %>%
      mutate(xb0 = cbind(1, as.matrix(x0a)) %*% as.numeric(newcoefsa),
             xb1 = cbind(1, as.matrix(x1a)) %*% as.numeric(newcoefsa),
             xb2 = cbind(1, as.matrix(x2a)) %*% as.numeric(newcoefsa),
             xb3 = cbind(1, as.matrix(x3a)) %*% as.numeric(newcoefsa),
             xb4 = cbind(1, as.matrix(x4a)) %*% as.numeric(newcoefsa),
             xb5 = cbind(1, as.matrix(x5a)) %*% as.numeric(newcoefsa),
             xb6 = cbind(1, as.matrix(x6a)) %*% as.numeric(newcoefsa))
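    # The mean contribution of group g to the gap will be computed below as
    #   cont_g = mean(F(xb_{g-1})) - mean(F(xb_g)),
    # where F is the logistic CDF applied in the next step. Note (not in the
    # original script): stats::plogis(xb0) is an equivalent, numerically
    # stabler alternative to exp(xb0) / (1 + exp(xb0)).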
    # Calculate various predicted probabilities
    combined <- combined %>%
      mutate(pred0 = exp(xb0) / (1 + exp(xb0)),
             pred1 = exp(xb1) / (1 + exp(xb1)),
             pred2 = exp(xb2) / (1 + exp(xb2)),
             pred3 = exp(xb3) / (1 + exp(xb3)),
             pred4 = exp(xb4) / (1 + exp(xb4)),
             pred5 = exp(xb5) / (1 + exp(xb5)),
             pred6 = exp(xb6) / (1 + exp(xb6)))

    # Calculate various pdfs for standard error calculations
    # NOTE:PROBIT update for normal distribution
    combined <- combined %>%
      mutate(fhat0 = pred0 * (1 - pred0),
             fhat1 = pred1 * (1 - pred1),
             fhat2 = pred2 * (1 - pred2),
             fhat3 = pred3 * (1 - pred3),
             fhat4 = pred4 * (1 - pred4),
             fhat5 = pred5 * (1 - pred5),
             fhat6 = pred6 * (1 - pred6))
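    # NOTE:PROBIT (sketch, not in the original script): for a probit model the
    # pdf terms above would instead be the normal density at the index, e.g.
    #   fhat0 = dnorm(xb0)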
    # Create intercept component of the derivatives
    dc1db0 <- combined$fhat0 - combined$fhat1
    dc2db0 <- combined$fhat1 - combined$fhat2
    dc3db0 <- combined$fhat2 - combined$fhat3
    dc4db0 <- combined$fhat3 - combined$fhat4
    dc5db0 <- combined$fhat4 - combined$fhat5
    dc6db0 <- combined$fhat5 - combined$fhat6
    dc1db <- data.frame(dc1db0)
    dc2db <- data.frame(dc2db0)
    dc3db <- data.frame(dc3db0)
    dc4db <- data.frame(dc4db0)
    dc5db <- data.frame(dc5db0)
    dc6db <- data.frame(dc6db0)

    # Add the derivative of each contribution with respect to each slope coefficient
    for (v in 1:k) {
      assign(paste0("dc1db", v), combined$fhat0 * x0a[v] - combined$fhat1 * x1a[v])
      dc1db <- cbind(dc1db, get(paste0('dc1db', v)))
      assign(paste0("dc2db", v), combined$fhat1 * x1a[v] - combined$fhat2 * x2a[v])
      dc2db <- cbind(dc2db, get(paste0('dc2db', v)))
      assign(paste0("dc3db", v), combined$fhat2 * x2a[v] - combined$fhat3 * x3a[v])
      dc3db <- cbind(dc3db, get(paste0('dc3db', v)))
      assign(paste0("dc4db", v), combined$fhat3 * x3a[v] - combined$fhat4 * x4a[v])
      dc4db <- cbind(dc4db, get(paste0('dc4db', v)))
      assign(paste0("dc5db", v), combined$fhat4 * x4a[v] - combined$fhat5 * x5a[v])
      dc5db <- cbind(dc5db, get(paste0('dc5db', v)))
      assign(paste0("dc6db", v), combined$fhat5 * x5a[v] - combined$fhat6 * x6a[v])
      dc6db <- cbind(dc6db, get(paste0('dc6db', v)))
    }
    colnames(dc1db) <- paste0('dc1db', seq(0, k))
    colnames(dc2db) <- paste0('dc2db', seq(0, k))
    colnames(dc3db) <- paste0('dc3db', seq(0, k))
    colnames(dc4db) <- paste0('dc4db', seq(0, k))
    colnames(dc5db) <- paste0('dc5db', seq(0, k))
    colnames(dc6db) <- paste0('dc6db', seq(0, k))
    combined <- combined %>%
      cbind(dc1db, dc2db, dc3db, dc4db, dc5db, dc6db)

    # Clean up coefficient/covariance dataset
    # covar <- cov_matrix[-c(2:(r+1)), -c(2:(r+1))]
    # Reorder covariance matrix to match the randomized variable ordering
    covar <- cov_matrix[c("(Intercept)", newvars), c("(Intercept)", newvars)]

    # Calculate decomposition estimates and means
    mean_data <- combined %>%
      summarize_at(vars(yw, starts_with('pred'), ym, starts_with('dc')),
                   mean, na.rm = TRUE)

    # Matrix operations for covariance and standard errors
    V <- as.matrix(covar)

    # Function for matrix multiplication to calculate standard errors
    # (delta method: average the derivatives, then DCDB %*% V %*% t(DCDB))
    calculate_var <- function(DCDB, V) {
      DCDB <- DCDB %>%
        summarize_all(mean, na.rm = TRUE)
      as.matrix(DCDB) %*% V %*% t(as.matrix(DCDB))
    }

    # Calculate variances of the contribution estimates
    VAR1 <- calculate_var(dc1db, V)
    VAR2 <- calculate_var(dc2db, V)
    VAR3 <- calculate_var(dc3db, V)
    VAR4 <- calculate_var(dc4db, V)
    VAR5 <- calculate_var(dc5db, V)
    VAR6 <- calculate_var(dc6db, V)

    # Contributions of each group of variables to the gap
    means2 <- mean_data %>%
      cbind(ywdata, ymdata) %>%
      mutate(cont1 = pred0 - pred1,
             cont2 = pred1 - pred2,
             cont3 = pred2 - pred3,
             cont4 = pred3 - pred4,
             cont5 = pred4 - pred5,
             cont6 = pred5 - pred6,
             gap = ywfull - ymfull,
             perc1 = cont1 / gap,
             perc2 = cont2 / gap,
             perc3 = cont3 / gap,
             perc4 = cont4 / gap,
             perc5 = cont5 / gap,
             perc6 = cont6 / gap,
             se1 = sqrt(VAR1),
             se2 = sqrt(VAR2),
             se3 = sqrt(VAR3),
             se4 = sqrt(VAR4),
             se5 = sqrt(VAR5),
             se6 = sqrt(VAR6)) %>%
      select(ywfull, ymfull, gap,
             cont1, se1, perc1, cont2, se2, perc2, cont3, se3, perc3,
             cont4, se4, perc4, cont5, se5, perc5, cont6, se6, perc6) %>%
      t() %>% as.data.frame() %>%
      tibble::rownames_to_column(var = "RowLabels") %>%
      # Map the randomized group numbers back to the original group numbers
      mutate(ordgroup = as.numeric(str_extract(RowLabels, "\\d+$"))) %>%
      left_join(rand, by = c("ordgroup")) %>%
      mutate(RowLabels_new = str_replace(RowLabels, "\\d+$", as.character(i))) %>%
      # Rows without a trailing group number (ywfull, ymfull, gap) keep their labels
      mutate(RowLabels = ifelse(is.na(ordgroup), RowLabels, RowLabels_new)) %>%
      select(RowLabels, V1)

    final <- rbind(final, means2)
  }
  return(final)
}

# Run the simulation
final <- simulate()
print(Sys.time() - t0)

# Summarize simulations and export output to excel
meandecomp <- final %>%
  group_by(RowLabels) %>%
  summarize_all(mean) %>%
  rbind(data.frame(RowLabels = 'iterations', V1 = numiterations)) %>%
  left_join(data.frame(Group = labelgroup,
                       RowLabels = paste0('cont', seq(1, 6))),
            by = c('RowLabels')) %>%
  mutate(Group = ifelse(is.na(Group), "", Group)) %>%
  rename(mean = V1) %>%
  select(RowLabels, Group, mean)

# Reorder rows for the final table
meandecomp$RowLabels <- factor(meandecomp$RowLabels,
                               levels = c("ywfull", "ymfull", "gap",
                                          paste0(c("cont", "se", "perc"),
                                                 rep(seq(1, length(labelgroup)), each = 3)),
                                          "iterations"))
meandecomp <- meandecomp %>%
  arrange(RowLabels)

# Export
meandecomp %>%
  as.data.frame() %>%
  xlsx::write.xlsx(file = 'Results/decomp_random_v7_3.xlsx', row.names = FALSE)
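# Note (not in the original script): the xlsx package requires Java; if that
# is unavailable, openxlsx offers a Java-free alternative for this step, e.g.
#   openxlsx::write.xlsx(meandecomp, file = 'Results/decomp_random_v7_3.xlsx')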