###########################################################################
# Non-Linear Decomposition Technique for Logit or Probit Model            #
# Randomized Ordering of Variables                                        #
# Originally developed in: Fairlie, Robert. 1999. "The Absence of the     #
# African-American Owned Business: An Analysis of the Dynamics of Self-   #
# Employment," Journal of Labor Economics, 17(1): 80-108                  #
# Latest Revision: Fairlie, Robert. 2017. "Addressing Path Dependence and #
# Incorporating Sample Weights in the Nonlinear Blinder-Oaxaca            #
# Decomposition Technique for Logit, Probit and Other Nonlinear Models"   #
# Stanford Institute for Economic Policy Research, Working Paper 17-013   #
# Translated to R by Beatriz Rache and RF in May 2025                     #
###########################################################################

# NOTES:
# Currently set to randomize ordering of groups of variables, see NOTE:ORDERING to change
# Currently set for use with sample weights, see NOTE:WEIGHTS to change
# Currently set for pooled (all races) coefficient estimates, see NOTE:POOLED to change
# Currently set for 10 iterations, see NOTE:ITERATIONS to change
# Currently set for Logit model, see NOTE:PROBIT to change
# Example data set can be downloaded at http://people.ucsc.edu/~rfairlie/decomposition/

# Clear workspace and set directory
rm(list = ls())
# NEEDS TO BE CHANGED
setwd('C:/Users/rfairlie/temp/decomp/r2024')
t0 <- Sys.time()

# Set seed
set.seed(2024)

# Load packages
library(dplyr)
library(readxl)
library(xlsx)
library(stats)
library(stringr)

# Specify number of iterations
# NOTE:ITERATIONS change this to 1000 or higher for the final run, but test with fewer
numiterations <- 10

# NOTE:POOLED define race variables to be included only in pooled logits
r <- 4
racevars <- c("black", "latino", "natamer", "asian")

# Specify total number of independent variables, not including race dummies above
k <- 31

# Define categories of variables for decomposition
group1 <- c("female", "age")
group2 <- c("married", "prevmar", "children", "chld617")
group3 <- c("hsgrad", "somecol", "college", "gradsch")
group4 <- c("inc1015", "inc1520", "inc2025", "inc2530", "inc3035",
            "inc3540", "inc4050", "inc5060", "inc6075", "incgt75")
group5 <- c("midatlan", "encent", "wncent", "satlan", "escent",
            "wscent", "mountain", "pacific")
group6 <- c("notcc", "notmsa", "notid")

# Define short labels for each group of variables for final table output
labelgroup <- c("Gender/Age", "Family", "Education", "Income", "Region", "City")

# Rename minority group variables
mgroup1 <- c("mfemale", "mage")
mgroup2 <- c("mmarried", "mprevmar", "mchildren", "mchld617")
mgroup3 <- c("mhsgrad", "msomecol", "mcollege", "mgradsch")
mgroup4 <- c("minc1015", "minc1520", "minc2025", "minc2530", "minc3035",
             "minc3540", "minc4050", "minc5060", "minc6075", "mincgt75")
mgroup5 <- c("mmidatlan", "mencent", "mwncent", "msatlan", "mescent",
             "mwscent", "mmountain", "mpacific")
mgroup6 <- c("mnotcc", "mnotmsa", "mnotid")

# Rename reference group (white) variables
wgroup1 <- c("wfemale", "wage")
wgroup2 <- c("wmarried", "wprevmar", "wchildren", "wchld617")
wgroup3 <- c("whsgrad", "wsomecol", "wcollege", "wgradsch")
wgroup4 <- c("winc1015", "winc1520", "winc2025", "winc2530", "winc3035",
             "winc3540", "winc4050", "winc5060", "winc6075", "wincgt75")
wgroup5 <- c("wmidatlan", "wencent", "wwncent", "wsatlan", "wescent",
             "wwscent", "wmountain", "wpacific")
wgroup6 <- c("wnotcc", "wnotmsa", "wnotid")

# Combine groups
vars  <- c(group1, group2, group3, group4, group5, group6)
mvars <- c(mgroup1, mgroup2, mgroup3, mgroup4, mgroup5, mgroup6)
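# Optional sanity check (not in the original script): confirm that the
# declared variable count k matches the group definitions above
stopifnot(length(vars) == k, length(mvars) == k)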
wvars <- c(wgroup1, wgroup2, wgroup3, wgroup4, wgroup5, wgroup6)

# Prepare original data for program
# NEEDS TO BE CHANGED
temp <- read.csv("C:/Users/rfairlie/temp/decomp/finaldecomp00.csv") %>%
  # Define dependent variable
  mutate(y = homecomp) %>%
  # Delete observations with missing values for the dependent variable or
  # key independent variables (extend the list as needed)
  filter(!is.na(y) & !is.na(hsgrad) & !is.na(inc1015)) %>%
  # NOTE:WEIGHTS, if no sample weights in data then set wgt to 1
  # mutate(wgt = 1) %>%
  # Delete observations with missing, zero or negative sample weights
  filter(!is.na(wgt) & wgt > 0) %>%
  # Define sample - e.g. only keep working-age adults for this run
  filter(age >= 25 & age <= 55) %>%
  # Normalize weights to mean one
  mutate(wgt = wgt / mean(wgt))

# Define subset of data for estimating coefficients
# NOTE:POOLED, currently set to use pooled sample (all racial groups) to estimate coefficients
# Could be changed to use the sample for only one group (e.g. white or black coefficients)
regdata <- temp %>%
  mutate(y = as.integer(y))
  # filter(white == 1)
  # filter(black == 1)

# Create minority sample with minority variable names
# Define which minority group is used
minority <- temp %>%
  filter(black == 1) %>%
  select(ym = y, wgtm = wgt, all_of(vars))

# Create full reference group (white) sample with reference group (white) variable names
white <- temp %>%
  filter(white == 1) %>%
  select(yw = y, wgtw = wgt, all_of(vars))

# Calculate weighted full-sample means (print these to inspect the data)
minority_means <- minority %>%
  summarise(across(everything(), ~ weighted.mean(.x, wgtm, na.rm = TRUE)))
white_means <- white %>%
  summarise(across(everything(), ~ weighted.mean(.x, wgtw, na.rm = TRUE)))

# Calculate means of dependent variables for the full samples
# These values are used to calculate the total gap in the decomposition
ymdata <- minority %>%
  summarise(ymfull = weighted.mean(ym, wgtm, na.rm = TRUE), mn = n())
ywdata <- white %>%
  summarise(ywfull = weighted.mean(yw, wgtw, na.rm = TRUE), wn = n())

# Define global variables for full white and full minority sample sizes
mobs <- ymdata$mn
wobs <- ywdata$wn

# Estimate logit model to obtain coefficients
# NOTE:POOLED set for pooled or specific group sample above
formula <- paste0('y ~ ', paste(c(racevars, vars), collapse = '+'))
logit_model <- suppressWarnings(glm(as.formula(formula),
                                    family = binomial(link = "logit"),
                                    data = regdata, weights = wgt))
orgcoefs <- coef(logit_model)
cov_matrix <- vcov(logit_model)

# Remove race dummies from coefficient dataset
# NOTE:POOLED only need this for pooled estimates
coefs <- orgcoefs[!names(orgcoefs) %in% racevars]

# Calculate predicted probabilities for both samples
# NOTE:PROBIT, need to change functional form here and below
white <- white %>%
  mutate(xbeta = as.numeric(cbind(1, as.matrix(select(., all_of(vars)))) %*% as.numeric(coefs)),
         wordprob = exp(xbeta) / (1 + exp(xbeta)))
# NOTE:PROBIT use normal distribution for probit
minority <- minority %>%
  mutate(xbeta = as.numeric(cbind(1, as.matrix(select(., all_of(vars)))) %*% as.numeric(coefs)),
         mordprob = exp(xbeta) / (1 + exp(xbeta)))
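# NOTE:PROBIT (sketch, not in the original script): for a probit model the
# predicted probabilities above would use the normal CDF in place of the
# logistic CDF, e.g.
#   wordprob = pnorm(xbeta)
#   mordprob = pnorm(xbeta)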
# Create empty starting datasets for iterations
final <- data.frame()
means2 <- data.frame()

# Define function for iterations (a macro in the original SAS/Stata versions)
simulate <- function() {
  for (it in 1:numiterations) {

    # Draw a weighted random sample of the reference group equal in size to
    # the minority sample, then put it in random order
    rename_vector_w <- setNames(wvars, vars)
    white1 <- white %>%
      sample_n(mobs, replace = TRUE, weight = wgtw)
    white1 <- white1 %>%
      mutate(random1 = runif(n()))
    white2 <- white1 %>%
      arrange(random1) %>%
      select(-c(xbeta)) %>%
      rename_at(vars(names(rename_vector_w)), ~rename_vector_w)

    # Randomly reorder the minority sample
    rename_vector_m <- setNames(mvars, vars)
    minority1 <- minority %>%
      sample_n(mobs, replace = FALSE, weight = wgtm) %>%
      select(-c(xbeta)) %>%
      rename_at(vars(names(rename_vector_m)), ~rename_vector_m)

    # Merge datasets together for matching, random matching
    combined <- white2 %>%
      cbind(minority1)

    # NOTE:ORDERING
    # Randomly assign variable groups ("defined groups") to "number groups"
    # for random ordering of variable groups
    rand <- tibble(i = 1:6, r = runif(6)) %>%
      mutate(ordgroup = ntile(r, 6))  # for random order
      # mutate(ordgroup = i)          # change to original order
      # mutate(ordgroup = 7 - i)      # change to reverse order

    # Define new variable groups based on the random ordering
    new_group1 <- get(paste0('group', rand$i[rand$ordgroup == 1]))
    new_group2 <- get(paste0('group', rand$i[rand$ordgroup == 2]))
    new_group3 <- get(paste0('group', rand$i[rand$ordgroup == 3]))
    new_group4 <- get(paste0('group', rand$i[rand$ordgroup == 4]))
    new_group5 <- get(paste0('group', rand$i[rand$ordgroup == 5]))
    new_group6 <- get(paste0('group', rand$i[rand$ordgroup == 6]))
    w_new_group1 <- paste0('w', new_group1)
    w_new_group2 <- paste0('w', new_group2)
    w_new_group3 <- paste0('w', new_group3)
    w_new_group4 <- paste0('w', new_group4)
    w_new_group5 <- paste0('w', new_group5)
    w_new_group6 <- paste0('w', new_group6)
    m_new_group1 <- paste0('m', new_group1)
    m_new_group2 <- paste0('m', new_group2)
    m_new_group3 <- paste0('m', new_group3)
    m_new_group4 <- paste0('m', new_group4)
    m_new_group5 <- paste0('m', new_group5)
    m_new_group6 <- paste0('m', new_group6)

    # Assign new variable groups in the new order
    newvars <- c(new_group1, new_group2, new_group3,
                 new_group4, new_group5, new_group6)

    # Create a named vector for new coefficient assignment
    newcoefsa <- setNames(numeric(k + 1), c("(Intercept)", newvars))
    # Fill in with coefficient estimates
    newcoefsa[names(newcoefsa) %in% names(coefs)] <- coefs[names(newcoefsa)]

    # Define distribution switches: x0a uses all white characteristics, and
    # each subsequent set switches one more group to minority characteristics
    x0a <- combined %>%
      select(all_of(w_new_group1), all_of(w_new_group2), all_of(w_new_group3),
             all_of(w_new_group4), all_of(w_new_group5), all_of(w_new_group6))
    x1a <- combined %>%
      select(all_of(m_new_group1), all_of(w_new_group2), all_of(w_new_group3),
             all_of(w_new_group4), all_of(w_new_group5), all_of(w_new_group6))
    x2a <- combined %>%
      select(all_of(m_new_group1), all_of(m_new_group2), all_of(w_new_group3),
             all_of(w_new_group4), all_of(w_new_group5), all_of(w_new_group6))
    x3a <- combined %>%
      select(all_of(m_new_group1), all_of(m_new_group2), all_of(m_new_group3),
             all_of(w_new_group4), all_of(w_new_group5), all_of(w_new_group6))
    x4a <- combined %>%
      select(all_of(m_new_group1), all_of(m_new_group2), all_of(m_new_group3),
             all_of(m_new_group4), all_of(w_new_group5), all_of(w_new_group6))
    x5a <- combined %>%
      select(all_of(m_new_group1), all_of(m_new_group2), all_of(m_new_group3),
             all_of(m_new_group4), all_of(m_new_group5), all_of(w_new_group6))
    x6a <- combined %>%
      select(all_of(m_new_group1), all_of(m_new_group2), all_of(m_new_group3),
             all_of(m_new_group4), all_of(m_new_group5), all_of(m_new_group6))

    # Perform white to black variable distribution switches
    combined <- combined %>%
      mutate(xb0 = cbind(1, as.matrix(x0a)) %*% as.numeric(newcoefsa),
             xb1 = cbind(1, as.matrix(x1a)) %*% as.numeric(newcoefsa),
             xb2 = cbind(1, as.matrix(x2a)) %*% as.numeric(newcoefsa),
             xb3 = cbind(1, as.matrix(x3a)) %*% as.numeric(newcoefsa),
             xb4 = cbind(1, as.matrix(x4a)) %*% as.numeric(newcoefsa),
             xb5 = cbind(1, as.matrix(x5a)) %*% as.numeric(newcoefsa),
             xb6 = cbind(1, as.matrix(x6a)) %*% as.numeric(newcoefsa))
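    # The mean contribution of group g to the gap will be computed below as
    #   cont_g = mean(F(xb_{g-1})) - mean(F(xb_g)),
    # where F is the logistic CDF applied in the next step. Note (not in the
    # original script): stats::plogis(xb0) is an equivalent, numerically
    # stabler alternative to exp(xb0) / (1 + exp(xb0)).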
    # Calculate various predicted probabilities
    combined <- combined %>%
      mutate(pred0 = exp(xb0) / (1 + exp(xb0)),
             pred1 = exp(xb1) / (1 + exp(xb1)),
             pred2 = exp(xb2) / (1 + exp(xb2)),
             pred3 = exp(xb3) / (1 + exp(xb3)),
             pred4 = exp(xb4) / (1 + exp(xb4)),
             pred5 = exp(xb5) / (1 + exp(xb5)),
             pred6 = exp(xb6) / (1 + exp(xb6)))

    # Calculate various pdfs for standard error calculations
    # NOTE:PROBIT update for normal distribution
    combined <- combined %>%
      mutate(fhat0 = pred0 * (1 - pred0),
             fhat1 = pred1 * (1 - pred1),
             fhat2 = pred2 * (1 - pred2),
             fhat3 = pred3 * (1 - pred3),
             fhat4 = pred4 * (1 - pred4),
             fhat5 = pred5 * (1 - pred5),
             fhat6 = pred6 * (1 - pred6))
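    # NOTE:PROBIT (sketch, not in the original script): for a probit model the
    # pdf terms above would instead be the normal density at the index, e.g.
    #   fhat0 = dnorm(xb0)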
    # Create intercept component of the derivatives
    dc1db0 <- combined$fhat0 - combined$fhat1
    dc2db0 <- combined$fhat1 - combined$fhat2
    dc3db0 <- combined$fhat2 - combined$fhat3
    dc4db0 <- combined$fhat3 - combined$fhat4
    dc5db0 <- combined$fhat4 - combined$fhat5
    dc6db0 <- combined$fhat5 - combined$fhat6
    dc1db <- data.frame(dc1db0)
    dc2db <- data.frame(dc2db0)
    dc3db <- data.frame(dc3db0)
    dc4db <- data.frame(dc4db0)
    dc5db <- data.frame(dc5db0)
    dc6db <- data.frame(dc6db0)

    # Add the derivative of each contribution with respect to each slope coefficient
    for (v in 1:k) {
      assign(paste0("dc1db", v), combined$fhat0 * x0a[v] - combined$fhat1 * x1a[v])
      dc1db <- cbind(dc1db, get(paste0('dc1db', v)))
      assign(paste0("dc2db", v), combined$fhat1 * x1a[v] - combined$fhat2 * x2a[v])
      dc2db <- cbind(dc2db, get(paste0('dc2db', v)))
      assign(paste0("dc3db", v), combined$fhat2 * x2a[v] - combined$fhat3 * x3a[v])
      dc3db <- cbind(dc3db, get(paste0('dc3db', v)))
      assign(paste0("dc4db", v), combined$fhat3 * x3a[v] - combined$fhat4 * x4a[v])
      dc4db <- cbind(dc4db, get(paste0('dc4db', v)))
      assign(paste0("dc5db", v), combined$fhat4 * x4a[v] - combined$fhat5 * x5a[v])
      dc5db <- cbind(dc5db, get(paste0('dc5db', v)))
      assign(paste0("dc6db", v), combined$fhat5 * x5a[v] - combined$fhat6 * x6a[v])
      dc6db <- cbind(dc6db, get(paste0('dc6db', v)))
    }
    colnames(dc1db) <- paste0('dc1db', seq(0, k))
    colnames(dc2db) <- paste0('dc2db', seq(0, k))
    colnames(dc3db) <- paste0('dc3db', seq(0, k))
    colnames(dc4db) <- paste0('dc4db', seq(0, k))
    colnames(dc5db) <- paste0('dc5db', seq(0, k))
    colnames(dc6db) <- paste0('dc6db', seq(0, k))
    combined <- combined %>%
      cbind(dc1db, dc2db, dc3db, dc4db, dc5db, dc6db)

    # Clean up coefficient/covariance dataset
    # covar <- cov_matrix[-c(2:(r+1)), -c(2:(r+1))]
    # Reorder covariance matrix to match the randomized variable ordering
    covar <- cov_matrix[c("(Intercept)", newvars), c("(Intercept)", newvars)]

    # Calculate decomposition estimates and means
    mean_data <- combined %>%
      summarize_at(vars(yw, starts_with('pred'), ym, starts_with('dc')),
                   mean, na.rm = TRUE)

    # Matrix operations for covariance and standard errors
    V <- as.matrix(covar)

    # Function for matrix multiplication to calculate standard errors
    # (delta method: average the derivatives, then DCDB %*% V %*% t(DCDB))
    calculate_var <- function(DCDB, V) {
      DCDB <- DCDB %>%
        summarize_all(mean, na.rm = TRUE)
      as.matrix(DCDB) %*% V %*% t(as.matrix(DCDB))
    }

    # Calculate variances of the contribution estimates
    VAR1 <- calculate_var(dc1db, V)
    VAR2 <- calculate_var(dc2db, V)
    VAR3 <- calculate_var(dc3db, V)
    VAR4 <- calculate_var(dc4db, V)
    VAR5 <- calculate_var(dc5db, V)
    VAR6 <- calculate_var(dc6db, V)

    # Contributions of each group of variables to the gap
    means2 <- mean_data %>%
      cbind(ywdata, ymdata) %>%
      mutate(cont1 = pred0 - pred1,
             cont2 = pred1 - pred2,
             cont3 = pred2 - pred3,
             cont4 = pred3 - pred4,
             cont5 = pred4 - pred5,
             cont6 = pred5 - pred6,
             gap = ywfull - ymfull,
             perc1 = cont1 / gap,
             perc2 = cont2 / gap,
             perc3 = cont3 / gap,
             perc4 = cont4 / gap,
             perc5 = cont5 / gap,
             perc6 = cont6 / gap,
             se1 = sqrt(VAR1),
             se2 = sqrt(VAR2),
             se3 = sqrt(VAR3),
             se4 = sqrt(VAR4),
             se5 = sqrt(VAR5),
             se6 = sqrt(VAR6)) %>%
      select(ywfull, ymfull, gap,
             cont1, se1, perc1, cont2, se2, perc2, cont3, se3, perc3,
             cont4, se4, perc4, cont5, se5, perc5, cont6, se6, perc6) %>%
      t() %>% as.data.frame() %>%
      tibble::rownames_to_column(var = "RowLabels") %>%
      # Map the randomized group numbers back to the original group numbers
      mutate(ordgroup = as.numeric(str_extract(RowLabels, "\\d+$"))) %>%
      left_join(rand, by = c("ordgroup")) %>%
      mutate(RowLabels_new = str_replace(RowLabels, "\\d+$", as.character(i))) %>%
      # Rows without a trailing group number (ywfull, ymfull, gap) keep their labels
      mutate(RowLabels = ifelse(is.na(ordgroup), RowLabels, RowLabels_new)) %>%
      select(RowLabels, V1)

    final <- rbind(final, means2)
  }
  return(final)
}

# Run the simulation
final <- simulate()
print(Sys.time() - t0)

# Summarize simulations and export output to excel
meandecomp <- final %>%
  group_by(RowLabels) %>%
  summarize_all(mean) %>%
  rbind(data.frame(RowLabels = 'iterations', V1 = numiterations)) %>%
  left_join(data.frame(Group = labelgroup,
                       RowLabels = paste0('cont', seq(1, 6))),
            by = c('RowLabels')) %>%
  mutate(Group = ifelse(is.na(Group), "", Group)) %>%
  rename(mean = V1) %>%
  select(RowLabels, Group, mean)

# Reorder rows for the final table
meandecomp$RowLabels <- factor(meandecomp$RowLabels,
                               levels = c("ywfull", "ymfull", "gap",
                                          paste0(c("cont", "se", "perc"),
                                                 rep(seq(1, length(labelgroup)), each = 3)),
                                          "iterations"))
meandecomp <- meandecomp %>%
  arrange(RowLabels)

# Export
meandecomp %>%
  as.data.frame() %>%
  xlsx::write.xlsx(file = 'Results/decomp_random_v7_3.xlsx', row.names = FALSE)
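# Note (not in the original script): the xlsx package requires Java; if that
# is unavailable, openxlsx offers a Java-free alternative for this step, e.g.
#   openxlsx::write.xlsx(meandecomp, file = 'Results/decomp_random_v7_3.xlsx')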