#Program 1: Calculate the yearly institutionalization rate by race
#Input: data/<IPUMS_USA_Extract.dta>
#Output: output/inst_pop.csv

#Load the IPUMS USA extract (Stata format) from the path held in `ipumsusa`
#(`ipumsusa` is not defined in this section of the script)
fulldta <- read.dta13(ipumsusa)

#Recode age as a number: the as.numeric() value minus 1
#NOTE(review): presumably age arrives as a labelled factor whose first level is age 0,
#so the level code minus 1 equals the age in years - confirm against the extract
fulldta[['age']] <- as.numeric(fulldta[['age']]) - 1

#Race/ethnicity flags. Hispanic origin is identified in every Census year, so each race
#flag below is restricted to non-Hispanics and Hispanics form their own category.

#White: non-Hispanic, white
fulldta$white <- with(fulldta, race == 'White' & (hispan == 'Not Hispanic'))

#Main definition of black: non-Hispanic, black only
fulldta$black <- with(fulldta, race == 'Black/African American/Negro' & (hispan == 'Not Hispanic'))

#Asian, Hispanic, Native American and other / multiple-race categories
fulldta$asian <- with(
  fulldta,
  race %in% c('Chinese', 'Japanese', 'Other Asian or Pacific Islander') & (hispan == 'Not Hispanic')
)
fulldta$hisp <- with(fulldta, hispan %in% c('Mexican', 'Puerto Rican', 'Cuban', 'Other'))
fulldta$native <- with(fulldta, race == 'American Indian or Alaska Native' & (hispan == 'Not Hispanic'))
fulldta$other <- with(
  fulldta,
  race %in% c('Other race, nec', 'Two major races', 'Three or more major races') & (hispan == 'Not Hispanic')
)

#Each person must fall in exactly one category (this also fails if any flag is NA)
stopifnot(fulldta$white + fulldta$black + fulldta$asian + fulldta$hisp + fulldta$native + fulldta$other == 1)

#Collapse the flags into one label per person
fulldta$raceclean <- ''
fulldta$raceclean[fulldta$white] <- 'white'
fulldta$raceclean[fulldta$black] <- 'black'
fulldta$raceclean[fulldta$hisp] <- 'hisp'
fulldta$raceclean[fulldta$asian] <- 'asian'
fulldta$raceclean[fulldta$native] <- 'native'
fulldta$raceclean[fulldta$other] <- 'other'

#Double check the distributions
table(fulldta$raceclean, useNA = 'always')

#Broad race variable: white / black / other (everyone who is neither white nor black)
fulldta$race_broad <- ifelse(
  fulldta$raceclean %in% c('hisp', 'asian', 'native', 'other'),
  'other',
  fulldta$raceclean
)
table(fulldta$race_broad, useNA = 'always')

#Flag adults (age 18 and over); allage is TRUE for every record
fulldta$adult <- fulldta$age >= 18
fulldta$allage <- TRUE

#Flag the institutionalized population using the detailed group quarters type codes.
#From the IPUMS .do file:
#  100 Institution (1990, 2000, ACS/PRCS)
#  200 Correctional institution
#  300 Mental institutions
#  410 Homes for elderly
#  411 Aged, dependent home
#  420 Other Instits (Not Aged)
#NOTE(review): this match assumes gqtyped holds the numeric codes; if it was read in as a
#labelled factor, %in% would compare against the label text instead - confirm with the table below
fulldta$institution <- fulldta$gqtyped %in% c(100, 200, 300, 410, 411, 420)
table(fulldta$institution, useNA = 'always')

#Person-weight columns: the weight where the flag(s) hold, zero otherwise
#  instpop       - institutionalized, all ages
#  instpop_adult - institutionalized adults
#  adultpop      - all adults
fulldta$instpop <- with(fulldta, institution * perwt)
fulldta$instpop_adult <- with(fulldta, institution * perwt * adult)
fulldta$adultpop <- with(fulldta, adult * perwt)

#Weighted total and institutionalized populations (all ages and adults only)
#for every Census year x broad race group
totpops <- fulldta %>%
  group_by(year, race_broad) %>%
  summarise(
    tot_pop = sum(perwt),
    tot_inst = sum(instpop),
    adult_pop = sum(adultpop),
    adult_inst = sum(instpop_adult)
  ) %>%
  data.frame()

#Add the race groups together within each year to get the "all" category
yrtots <- totpops %>%
  group_by(year) %>%
  summarise(
    tot_pop = sum(tot_pop),
    tot_inst = sum(tot_inst),
    adult_pop = sum(adult_pop),
    adult_inst = sum(adult_inst)
  ) %>%
  data.frame()

#Clean race dataset: race_broad becomes the leading `group` column
totpops <- select(totpops, group = race_broad, year, tot_pop, tot_inst, adult_pop, adult_inst)

#Label the all-races rows and stack them under the race rows (rbind matches columns by name)
yrtots$group <- 'all'
totpops <- rbind(totpops, yrtots)

#Institutionalization rates: institutionalized population / total population
totpops$tot_pct <- with(totpops, tot_inst / tot_pop)
totpops$adult_pct <- with(totpops, adult_inst / adult_pop)

#Now linearly interpolate between Census years

#Build the yearly institutionalization-rate series for every group x age category.
#  pops:      data frame with columns group, year, tot_pct and adult_pct
#  obs_years: increasing vector of years that have an observed rate in `pops`
#  groups:    values of pops$group to produce a series for
#  ages:      age categories; the rate is read from the column paste0(age, '_pct')
#Returns a data frame with columns group, age, year, inst covering every year from the
#first to the last observed year. Observed years keep their rate unchanged; each year in
#between lies on the straight line joining the two surrounding observed years.
#Stops with an error if a group has no row for one of the observed years, rather than
#writing misaligned rows.
interpolate_inst_rates <- function(pops, obs_years,
                                   groups = c('white', 'black', 'other', 'all'),
                                   ages = c('tot', 'adult')) {
  out_years <- min(obs_years):max(obs_years)

  #For each output year: position of the observed year at or before it (lo), of the next
  #observed year (hi; the final year points at itself), the number of years past lo, and
  #the width of the gap. These depend only on the years, so compute them once.
  lo <- findInterval(out_years, obs_years)
  hi <- pmin(lo + 1, length(obs_years))
  offset <- out_years - obs_years[lo]
  gap <- obs_years[hi] - obs_years[lo]

  #Normalise year so the lookup works whether it is stored as a number, string or factor
  pop_years <- as.numeric(as.character(pops$year))

  #age varies fastest, so rows come out ordered by group, then age category, then year
  combos <- expand.grid(age = ages, group = groups, stringsAsFactors = FALSE)

  series <- lapply(seq_len(nrow(combos)), function(k) {
    grp <- combos$group[k]
    age <- combos$age[k]
    pct_col <- paste0(age, '_pct')

    #Rows of this group, aligned to obs_years
    in_grp <- which(pops$group == grp)
    idx <- in_grp[match(obs_years, pop_years[in_grp])]
    if (anyNA(idx)) {
      stop('No row for group "', grp, '" in year(s): ',
           paste(obs_years[is.na(idx)], collapse = ', '), call. = FALSE)
    }
    rates <- pops[[pct_col]][idx]

    #Observed years (offset 0) take their rate as-is; the other branch is only used
    #strictly inside a gap, so the 0/0 it produces for the final year is never selected
    inst <- ifelse(offset == 0,
                   rates[lo],
                   rates[lo] + offset * (rates[hi] - rates[lo]) / gap)

    data.frame(group = grp, age = age, year = out_years, inst = inst,
               stringsAsFactors = FALSE)
  })

  do.call(rbind, series)
}

#Observed years: decennial Censuses 1960-2000, then annual data 2006-2016
#(2001-2005 are interpolated across the six-year gap between 2000 and 2006)
output <- interpolate_inst_rates(totpops, obs_years = c(seq(1960, 2000, by = 10), 2006:2016))

#Output clean data frame; year and inst are numeric columns, so they are written unquoted
names(output) <- c('group', 'age', 'year', 'inst')
write.csv(output, 'output/inst_pop.csv', row.names = FALSE)

# test = filter(output, age == 'tot', group == 'all')
# test$inst = as.numeric(as.character(test$inst))
# test$year = as.numeric(as.character(test$year))
# ggplot(test, aes(x = year, y = inst) ) + 
#   geom_line()
