# Make graphs of three measures of sorting over time for Figure 6

library(tidyverse)
library(scales)

# Every path below is relative to the project root.
# NOTE(review): setwd() ties the script to one machine's directory layout; it
# is kept so that existing invocations resolve the same files as before.
setwd("~/projects/regionineq/")

# Parallel lookup vectors, one entry per geography (MSAs vs. CZs).
# Only `filenames` and `citynames` are read by the code below; `varnames` and
# `cleannames` are left defined but are not referenced in this script.
varnames <- c("msa", "czone")
filenames <- c("msa", "cz")
cleannames <- c("msaname", "czname")
citynames <- c("CBSA", "Commuting Zone")

# Two-digit year tags embedded in the cross-variation file names.
census_years <- c("80", "90", "00", "08", "13")

# Sample identifiers (values matched against the `sample` / `samp` columns)
# and their human-readable descriptions. `tnames` is not used below.
snames <- c(
  "fam", "famsqrt", "famnorm",
  "hh", "hhsqrt", "hhnorm",
  "adult", "adultmale", "adultfemale"
)
tnames <- c(
  "family", "sqrt-normalized family", "normalized family",
  "household", "sqrt-normalized household", "normalized household",
  "adult individual", "adult male", "adult female"
)

# Measure columns, in the order they should appear in the legend.
measure_levels <- c("crosspct", "hindex", "zstratsm")

for (i in seq_along(filenames)) {
  fname <- filenames[i]
  cityname <- citynames[i]
  sorting_dir <- file.path("output", "ipums", fname, "sorting")

  # Bring in output: the H index file plus one cross-variation file per year.
  hindex <- read.csv(file.path(sorting_dir, paste0("hindex_", fname, ".csv")))

  # Read every year first and bind once instead of growing a data frame
  # inside a loop.
  crossc <- do.call(rbind, lapply(census_years, function(yr) {
    read.csv(
      file.path(sorting_dir, paste0("ipums_", yr, "crossvar_", fname, ".csv"))
    )
  }))

  # Legend labels depend only on the geography, so build them once per
  # geography rather than once per sample.
  measure_labels <- c(
    paste0("Prop. of total income variation\nacross ", cityname, "s"),
    "Rank-order information theory index (H)",
    "Zhou's S (mean of 10\nsamples of 100,000 obs.)"
  )

  # Loop through samples
  for (sname in snames) {
    # Select sample
    hdta <- filter(hindex, sample == sname)
    cdta <- filter(crossc, samp == sname) %>%
      select(year, crosspct, zstratsm)

    # Merge together
    dta <- merge(hdta, cdta, by = "year")

    # Create long form dataset: one row per year x measure.
    # gather() is superseded by pivot_longer(), but it is retained so the row
    # order of the CSV written below stays the same as before.
    pdta <- gather(dta, var, value, -year, -sample, -city)

    # Attach each measure's 1980 value (columns year_80 / value_80) and
    # express every observation relative to that baseline.
    p80 <- filter(pdta, year == 1980) %>%
      select(year, var, value)
    pdta <- merge(pdta, p80, by = "var", suffixes = c("", "_80"))
    pdta$pct80 <- pdta$value / pdta$value_80

    pdta$var <- factor(
      pdta$var,
      levels = measure_levels,
      labels = measure_labels
    )

    # Build the plot before any graphics device is opened, so a failure here
    # cannot leave a half-written PDF or a dangling device behind.
    gplt <- ggplot(pdta, aes(x = year, y = pct80, linetype = var)) +
      geom_line() +
      geom_point() +
      scale_y_continuous(
        name = "Percent of 1980 value",
        limits = c(0, 2),
        labels = percent
      ) +
      scale_x_continuous(name = "Year") +
      scale_linetype_discrete(name = "") +
      theme_bw() +
      theme(legend.position = "bottom") +
      guides(linetype = guide_legend(nrow = 2, byrow = TRUE))
      # ggtitle(paste("Change in alternative measures of income sorting across\n",cityname,'s over time',sep = ''))

    # Make sure the per-sample output folder exists before writing into it.
    out_dir <- file.path(sorting_dir, sname)
    dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
    out_stub <- file.path(out_dir, paste0("ipums_sorting_", fname, "_", sname))

    # Always close the PDF device, even when rendering the plot fails.
    pdf(paste0(out_stub, ".pdf"), height = 6)
    tryCatch(print(gplt), finally = dev.off())

    # Output csv with individual statistics
    write.csv(pdta, paste0(out_stub, ".csv"), row.names = FALSE)
  } # sample
} # geography
    