# Program 2: Calculate the necessary statistics in each year and sample
# Input:  data/<IPUMS_CPS_Extract.dta>
# Output: output/natdecs.csv
#         output/natpctiles.csv
#         output/raceineq.csv

## Bring in data ##

#Bring in CPI-U-RS
cpirs = read.dta13('data/cpirs.dta')
cpi15 = cpirs[cpirs$year == 2015,'rate']

#Bring in CPS
fulldta = read.dta13(ipumscps)



## Prepare and clean variables ##

#Clean income 
fulldta$hhincome = replace(fulldta$hhincome,list = fulldta$hhincome == 99999999,values = NA )
fulldta$inctot = replace(fulldta$inctot,list = fulldta$inctot == 99999999,values = NA )
fulldta$ftotval = replace(fulldta$ftotval,list = fulldta$ftotval == 999999,values = NA )

#Clean age
fulldta$age = as.numeric(fulldta$age) - 1

#Confirm the range of years
stopifnot(min(fulldta$year) == 1968)
stopifnot(max(fulldta$year) == 2016)

#Races
#Note that hispan is missing prior to 1971. Include those in categories for 1968-1970
stopifnot(fulldta[is.na(fulldta$hispan),'year'] <1971) #hispan is not missing in any year after 1971
stopifnot(is.na(fulldta[fulldta$year < 1971,'hispan'])) #hispan is not missing in any year after 1971
fulldta$white = fulldta$race == 'White' & (fulldta$hispan == 'Not Hispanic' | is.na(fulldta$hispan))

#Main definition of black - non-hispanic, black only 
fulldta$black = fulldta$race == 'Black/Negro' & (fulldta$hispan == 'Not Hispanic' | is.na(fulldta$hispan))

#Expanded definition of black - anyone who reports any black heritage, including hispanics (increases numbers by about 6% across all years) 
fulldta$blackexp = fulldta$race %in% c('Black/Negro','White-Black','Black-American Indian','Black-Asian','Black-Hawaiian/Pacific Islander','White-Black-American Indian','White-Black-Asian','White-Black-American Indian-Asian','Black-American Indian-Asian') 

#Other racial categories
fulldta$asian = fulldta$race %in% c('Asian or Pacific Islander','Asian only','Hawaiian/Pacific Islander only','Asian-Hawaiian/Pacific Islander') & (fulldta$hispan == 'Not Hispanic' | is.na(fulldta$hispan))
fulldta$hisp = !(fulldta$hispan %in% c('Not Hispanic','Do not know','N/A (and no response 1985-87)') )& !is.na(fulldta$hispan)

#Each person is in at most one category
stopifnot(fulldta$white + fulldta$black + fulldta$asian + fulldta$hisp <=1)

#Make flags for people who are native born, and native born to native born parents
#No missings after 1994
stopifnot(!is.na(fulldta$nativity) | fulldta$year < 1994)
fulldta$native = fulldta$nativity %in% c('Both parents native-born','Father foreign, mother native','Mother foreign, father native','Both parents foreign')
fulldta$native_bothpar = fulldta$nativity == 'Both parents native-born'

#Make calculated household and family income using serial and famunit
#And then make size-normalized variables

#Household income
fulldta = fulldta %>% group_by(year, serial) %>% mutate(hhcalc = sum(inctot, na.rm = T), hhsize = length(serial)) %>% data.frame()
fulldta$hhnorm = fulldta$hhcalc / fulldta$hhsize
fulldta$hhsqrt = fulldta$hhcalc / sqrt(fulldta$hhsize)

#Family income
fulldta = fulldta %>% group_by(year,serial,famunit) %>% mutate(famcalc = sum(inctot, na.rm = T), famsizecalc = length(serial)) %>% data.frame()
fulldta$famnorm = fulldta$famcalc / fulldta$famsizecalc
fulldta$famsqrt = fulldta$famcalc / sqrt(fulldta$famsizecalc)

#Confirm that calculated family size matches given family size variable
fulldta$famsizegiven = as.numeric(fulldta$famsize) - 1
stopifnot(fulldta$famsizecalc == fulldta$famsizegiven)



## Make analysis samples ##

#Family - using Census-calculated FTOTVAL to measure family income 
fambase = fulldta #Everyone
fambase_adult = fulldta %>% filter(age >= 18) #Adults
fambase_prime = fulldta %>% filter(age >= 25, age <= 54) #Prime age
fambase_exp = fulldta #Using expanded definition of black
fambase_bw = fulldta %>% filter(black == T | white == T) #Only blacks and whites
fambase_native = fulldta %>% filter(native == T | year < 1994) #Native born
fambase_native_bw = fulldta %>% filter((native == T | year < 1994), (black == T | white == T))
fambase_native_bp = fulldta %>% filter(native_bothpar == T | year < 1994) #Native born to native born parents

#Family sample adjusted for institutionalized population
fambase_inst = fulldta

#Make "other" category in main data
fambase_inst$other = fambase_inst$black == F & fambase_inst$white == F

#Get total population in each group so know how much to multiply percentage by
stopifnot(fambase_inst$black + fambase_inst$white + fambase_inst$other == 1)
totpop = fambase_inst %>% group_by(year, black, white,other) %>% summarise(totpop = sum(asecwt))

#Make one factor from dummies
totpop$race = factor(as.matrix(totpop[,c('black','white','other')]) %*% 1:3, labels = c('black','white','other'))

#Bring in and prepare incarceration rates
inst = read.csv('output/inst_pop.csv')

#Limit to total population, white/black/other
inst = filter(inst,age == 'tot', group %in% c('white','black','other'))
inst$race = inst$group

#Merge on total population
inst = merge(inst, totpop, by = c('year','race'))

#Get total institutionalized population by race and year
#Note have to compute the true total pop by dividing the total pop in the CPS by 
#one minus the incarceration rate. Then subtract the observed CPS population to get incarcerated population
inst$instpop = inst$totpop / (1-inst$inst) - inst$totpop 

#Now, go through each year, find the first person in each racial group with zero income, 
#and add the institutionalized population to their survey weight
for(yr in 1968: 2016){
  for(grp in c('white','black','other')){
    #Incarcerated population
    instpop = filter(inst, group == grp, year == yr) %>% select(instpop) %>% unlist()
    
    #Find someone in group and year with zero income
    stopifnot(length(which(fambase_inst[,grp] == T & fambase_inst$year == yr & fambase_inst$ftotval == 0)) > 0)
    cand = which(fambase_inst[,grp] == T & fambase_inst$year == yr & fambase_inst$ftotval == 0)[1]
    
    #Add instpop to their weight
    fambase_inst[cand,'asecwt'] =  fambase_inst[cand,'asecwt'] + instpop
  }
  stopifnot(abs(sum(filter(fambase_inst,year == yr)$asecwt) - sum(filter(fambase, year == yr)$asecwt) - sum(filter(inst, year == yr)$instpop)) < 5)
}

#Do black white only institutionalized-adjusted sample as well
fambase_inst_bw = fambase_inst %>% filter(black == T | white == T) 

#Family - calculated
famcalc = fulldta
famcalc_adult = fulldta %>% filter(age >= 18)
famcalc_exp = fulldta
famcalc_bw = fulldta %>% filter(black == T | white == T)

#Family - normalized
famnorm = fulldta
famnorm_adult = fulldta %>% filter(age >= 18)
famnorm_exp = fulldta
famnorm_bw = fulldta %>% filter(black == T | white == T)

#Family - square root
famsqrt = fulldta
famsqrt_adult = fulldta %>% filter(age >= 18)
famsqrt_exp = fulldta
famsqrt_bw = fulldta %>% filter(black == T | white == T)

#Household - given 
hhbase = fulldta
hhbase_adult = fulldta %>% filter(age >= 18)
hhbase_exp = fulldta
hhbase_bw = fulldta %>% filter(black == T | white == T)

#Household - calculated
hhcalc = fulldta
hhcalc_adult = fulldta %>% filter(age >= 18)
hhcalc_exp = fulldta
hhcalc_bw = fulldta %>% filter(black == T | white == T)

#Household - normalized
hhnorm = fulldta
hhnorm_adult = fulldta %>% filter(age >= 18)
hhnorm_exp = fulldta
hhnorm_bw = fulldta %>% filter(black == T | white == T)

#Household - square root
hhsqrt = fulldta
hhsqrt_adult = fulldta %>% filter(age >= 18)
hhsqrt_exp = fulldta
hhsqrt_bw = fulldta %>% filter(black == T | white == T)

#Full list of samples for looping
samps = list(fambase,fambase_adult,fambase_prime,fambase_bw,fambase_exp,fambase_native,fambase_native_bw,fambase_native_bp,fambase_inst,fambase_inst_bw,
             famcalc,famcalc_adult,famcalc_bw,famcalc_exp,
             famnorm,famnorm_adult,famnorm_bw,famnorm_exp,
             famsqrt,famsqrt_adult,famsqrt_bw,famsqrt_exp,
             hhbase,hhbase_adult,hhbase_bw,hhbase_exp,
             hhcalc,hhcalc_adult,hhcalc_bw,hhcalc_exp,
             hhnorm,hhnorm_adult,hhnorm_bw,hhnorm_exp,
             hhsqrt,hhsqrt_adult,hhsqrt_bw,hhsqrt_exp)
snames = c('fambase','fambase_adult','fambase_prime','fambase_bw','fambase_exp','fambase_native','fambase_native_bw','fambase_native_bp','fambase_inst','fambase_inst_bw',
           'famcalc','famcalc_adult','famcalc_bw','famcalc_exp',
           'famnorm','famnorm_adult','famnorm_bw','famnorm_exp',
           'famsqrt','famsqrt_adult','famsqrt_bw','famsqrt_exp',
           'hhbase','hhbase_adult','hhbase_bw','hhbase_exp',
           'hhcalc','hhcalc_adult','hhcalc_bw','hhcalc_exp',
           'hhnorm','hhnorm_adult','hhnorm_bw','hhnorm_exp',
           'hhsqrt','hhsqrt_adult','hhsqrt_bw','hhsqrt_exp')
tnames = c('Census family income','Census family income, adults only','Census family income, adults age 25-54 only','Census family income, blacks/whites only','Census family income, any black ancestry','Census family income, native-born citizens only','Census family income, native-born citizens only, blacks/whites only',
           'Census family income, native-born citizens of native-born parents only','Census family income, adjusting for\ninstitutionalized population','Census family income, adjusting for\ninstitutionalized population, blacks/whites only',
           'calculated family income','calculated family income, adults only','calculated family income, blacks/whites only','calculated family income, any black ancestry',
           'normalized family income','normalized family income, adults only','normalized family income, blacks/whites only','normalized family income, any black ancestry',
           'sqrt-normalized family income','sqrt-normalized family income, adults only','sqrt-normalized family income, blacks/whites only','sqrt-normalized family income, any black ancestry',
           'Census household income','Census household income, adults only','Census household income, blacks/whites only','Census household income, any black ancestry',
           'calculated household income','calculated household income, adults only','calculated household income, blacks/whites only','calculated household income, any black ancestry',
           'normalized household income','normalized household income, adults only','normalized household income, blacks/whites only','normalized household income, any black ancestry',
           'sqrt-normalized household income','sqrt-normalized household income, adults only','sqrt-normalized household income, blacks/whites only','sqrt-normalized household income, any black ancestry')



## Conduct Analysis ##

#Creat three output data frames
outp = NULL
natpctiles = NULL
natdecs = NULL

#Loop through each sample
for(s in 1:length(snames)) {
  samp = samps[[s]]
  sname = snames[s]
  tname = tnames[s]
  print(sname)
  
  #Assign target race variable - either only black or any black ancestry
  if(length(grep('_exp',sname)) == 0) samp$target = samp$black
  if(length(grep('_exp',sname)) != 0) samp$target = samp$blackexp
  samp$base = samp$white
  
  #Assign income variable
  if(length(grep('fambase',sname)) != 0) samp$rawinc = samp$ftotval
  if(length(grep('famcalc',sname)) != 0) samp$rawinc = samp$famcalc
  if(length(grep('famnorm',sname)) != 0) samp$rawinc = samp$famnorm
  if(length(grep('famsqrt',sname)) != 0) samp$rawinc = samp$famsqrt
  if(length(grep('hhbase',sname)) != 0) samp$rawinc = samp$hhincome
  if(length(grep('hhcalc',sname)) != 0) samp$rawinc = samp$hhcalc
  if(length(grep('hhnorm',sname)) != 0) samp$rawinc = samp$hhnorm
  if(length(grep('hhsqrt',sname)) != 0) samp$rawinc = samp$hhsqrt

  #Loop through years and construct national and racial distributions
  
  for(yr in 1968:2016){ #This takes a long time; almost all of it is in computing the weighted percentiles
    
    print(paste(sname, yr))
    dta = filter(samp,year == yr)
    
    #Inflate income to 2015 dollars
    cpiyr = filter(cpirs, year == yr - 1) %>% select(rate) %>% unlist() #Use CPI from year before since that's what they're reporting
    dta$inc = dta$rawinc * cpi15 / cpiyr
    
    #Drop if incomes are missing or negative
    dta = filter(dta, !is.na(inc))
    dta = filter(dta, inc >= 0)
  
    #Compute percentile cutoff values
    natpcts = wtd.quantile(dta$inc,dta$asecwt,probs = 1:100/100) 
    
    #Compute national mean
    natmean = wtd.mean(dta$inc,dta$asecwt)
    
    #Store for later
    natpctiles = rbind(natpctiles,c(sname,yr,natmean,natpcts))
    
    #Assign percentiles to individual observations
    natpcts = natpcts %>% data.frame()
    names(natpcts) = 'topcut'
    natpcts$pct = sub('%','',row.names(natpcts)) %>% as.numeric()
    
    #If more than 1% of obs have the same value, assign all obs with that value to the median percentile
    pctnames = natpcts %>% group_by(topcut) %>% summarise(pctile = median(pct)) %>% data.frame()
    
    #Make vector of cut points for assigning individuals to percentiles based on income
    sampcuts = c(-1, pctnames$topcut + .001)
    sampnames = pctnames$pctile
    
    #Assign people to national percentiles
    dta$pctile = cut(dta$inc,breaks = sampcuts,labels = sampnames)
    stopifnot(!is.na(dta$pctile))
    dta$pctile = as.numeric(as.character(dta$pctile))
    
    #Separately compute and assign to national deciles. Here no category should cover more than 10% of observations
    natdec = wtd.quantile(dta$inc, dta$asecwt, 1:10 / 10)
    stopifnot(length(natdec) == length(unique(natdec)))
    dta$dec = cut(dta$inc, breaks = c(-.1,natdec+.001),labels = 1:10)
    stopifnot(!is.na(dta$dec))
    
    #Get overall percent of country in each decile
    totpop = sum(dta$asecwt)
    whitepop = sum(filter(dta, base == T)$asecwt)
    blackpop = sum(filter(dta, target == T)$asecwt)
    totdecs = dta %>% group_by(dec) %>% summarise(pop = sum(asecwt)/totpop) %>% data.frame()
    whitedecs =  dta %>% filter(base == T) %>% group_by(dec) %>% summarise(pop = sum(asecwt)/whitepop) %>% data.frame()
    blackdecs =  dta %>% filter(target == T) %>% group_by(dec) %>% summarise(pop = sum(asecwt)/blackpop) %>% data.frame()
    names(blackdecs) = c('dec','pop_black')
    
    decs = merge(totdecs,whitedecs, by = 'dec', suffixes = c('_tot','_white'))
    decs = merge(decs,blackdecs, by = 'dec')
    decs$year = yr
    decs$samp = sname
    natdecs = rbind(natdecs,decs)
    
    #Now that we've computed percentiles for the whole nation, keep only blacks and whites
    dta = filter(dta, target == 1 | base == 1)
    
    #Compute income and rank at each decile of black and white distributions, plus mean
    cstats = dta %>% group_by(target) %>% summarise(
      pct1 = wtd.quantile(inc,asecwt,.1),
      pct2 = wtd.quantile(inc,asecwt,.2),
      pct3 = wtd.quantile(inc,asecwt,.3),
      pct4 = wtd.quantile(inc,asecwt,.4),
      pct5 = wtd.quantile(inc,asecwt,.5),
      pct6 = wtd.quantile(inc,asecwt,.6),
      pct7 = wtd.quantile(inc,asecwt,.7),
      pct8 = wtd.quantile(inc,asecwt,.8),
      pct9 = wtd.quantile(inc,asecwt,.9),
      mean = wtd.mean(inc,asecwt),
      rpct1 = wtd.quantile(pctile,asecwt,.1),
      rpct2 = wtd.quantile(pctile,asecwt,.2),
      rpct3 = wtd.quantile(pctile,asecwt,.3),
      rpct4 = wtd.quantile(pctile,asecwt,.4),
      rpct5 = wtd.quantile(pctile,asecwt,.5),
      rpct6 = wtd.quantile(pctile,asecwt,.6),
      rpct7 = wtd.quantile(pctile,asecwt,.7),
      rpct8 = wtd.quantile(pctile,asecwt,.8),
      rpct9 = wtd.quantile(pctile,asecwt,.9),
      rmean = wtd.mean(pctile,asecwt)
    ) %>% data.frame()

    #Reshape to get one row, labeled black and white
    cstats$race = factor(cstats$target, levels = c(T,F), labels = c('black','white'))
    cstats = select(cstats, -target)
    cstat = melt(cstats,id.vars = 'race')
    cst = dcast(cstat, . ~ race + variable, value.var = 'value')
    
    #Compute income ratios and rank differences
    for(v in c(paste('pct',1:9,sep = ''),'mean')){
      cst[,paste('rat_',v,sep = '')] = cst[,paste('black_',v,sep = '')] / cst[,paste('white_',v,sep = '')]
      cst[,paste('rdif_',v,sep = '')] = cst[,paste('white_r',v,sep = '')] - cst[,paste('black_r',v,sep = '')]  
    }
    cst = cst %>% select(-.) 
    dfnames = names(cst)
    
    #Save to output dataframe
    cst = c(sname,yr,unlist(cst))
    outp = rbind(outp, cst)
    
  } #year
  
  #Write results after each sample
  outdf = data.frame(outp)
  names(outdf) = c('samp','yr',dfnames)
  write.csv(outdf,paste('output/raceineq.csv',sep = ''),row.names = F)
  
  natpctilesdf = data.frame(natpctiles)
  names(natpctilesdf) = c('samp','yr','natmean',paste('pct',1:100,sep = ''))
  write.csv(natpctilesdf,paste('output/natpctiles.csv',sep = ''),row.names = F)
  
  write.csv(natdecs,paste('output/natdecs.csv',sep = ''),row.names = F)
  
} #sample

    