#Take output of ipums_percentiles and use to run counterfactual simulations
#Produce Figures 7 and 8

library(dplyr)
library(Hmisc)
library(maps)
library(stringr)
library(RColorBrewer)
library(scales)
library(readstata13)
library(ggplot2)
library(reshape2)
library(ggthemes)

setwd('~/projects/regionineq/')

#Crosswalk from MSAs to counties in the maps
msas = read.csv('data/mable/cntymsakey.csv')
msas$reg = msas$polyname

cntys = map_data('county')
cntys$reg = paste(cntys$region, cntys$subregion, sep = ',')
cntys = merge(cntys,msas,by = 'reg')

#Going to loop over MSAs vs CZs
varnames = c('msa','czone')
filenames = c('msa','cz')
cleannames = c('msaname','czname')
citynames = c('CBSA','Commuting Zone')

for(i in 1:length(varnames)){  #   2){ # 
  vname = varnames[i]
  fname = filenames[i]
  cname = cleannames[i]
  cityname = citynames[i]
  
  cntys$city = cntys[,vname]
  
  natldta = read.csv(paste('output/ipums/',fname,'/dta/',fname,'_natinfo.csv',sep = ''))
  natldta$year = natldta$fullyr
  natldta$med = natldta$median
  
  snames = c('fam','famsqrt','famnorm','hh','hhsqrt','hhnorm','adult','adultmale','adultfemale')
  tnames = c('family','sqrt-normalized family','normalized family','household','sqrt-normalized household','normalized household','adult individual','adult male','adult female')
  
  dir.create(paste('output/ipums/',fname,'/counterfactuals',sep = ''))
  
  cfsig = NULL
  cfbetas = NULL
  
  for(s in 1:length(snames)) { #2){ #
    sname = snames[s]
    tname = tnames[s]
    
    #Make dataframe to store national information from each sample
    natinfo80 = read.csv(paste('output/ipums/',fname,'/dta/',sname,'/',fname,'_pct_nat_',sname,'_1980.csv',sep = ''))
    natinfo90 = read.csv(paste('output/ipums/',fname,'/dta/',sname,'/',fname,'_pct_nat_',sname,'_1990.csv',sep = ''))
    natinfo00 = read.csv(paste('output/ipums/',fname,'/dta/',sname,'/',fname,'_pct_nat_',sname,'_2000.csv',sep = ''))
    natinfo08 = read.csv(paste('output/ipums/',fname,'/dta/',sname,'/',fname,'_pct_nat_',sname,'_2008.csv',sep = ''))
    natinfo13 = read.csv(paste('output/ipums/',fname,'/dta/',sname,'/',fname,'_pct_nat_',sname,'_2013.csv',sep = ''))
    
    natinfo_full = rbind(natinfo80,natinfo90,natinfo00, natinfo08, natinfo13)

    #Make city year by year data frame
    city80 = read.csv(paste('output/ipums/',fname,'/dta/',sname,'/',fname,'_pct_city_',sname,'_1980.csv',sep = ''))
    city90 = read.csv(paste('output/ipums/',fname,'/dta/',sname,'/',fname,'_pct_city_',sname,'_1990.csv',sep = ''))
    city00 = read.csv(paste('output/ipums/',fname,'/dta/',sname,'/',fname,'_pct_city_',sname,'_2000.csv',sep = ''))
    city10 = read.csv(paste('output/ipums/',fname,'/dta/',sname,'/',fname,'_pct_city_',sname,'_2008.csv',sep = ''))
    city13 = read.csv(paste('output/ipums/',fname,'/dta/',sname,'/',fname,'_pct_city_',sname,'_2013.csv',sep = ''))
    
    city_full = rbind(city80, city90, city00, city10, city13)
    
    #Add city variable
    city_full$city = city_full[,fname]
    
    dir.create(paste('output/ipums/',fname,'/counterfactuals/',sname,sep = ''))

    for(psize in c(1,2,4,5,10)){ # 2){ #
      
        st = 'mean'
        stname = 'mean'
        print(paste(vname,sname,psize,st))
        
        #Get national mean and median 
        natlmeans = filter(natldta,samp == sname)[,c('year',st)]
        names(natlmeans) = c('year','nattotstat')
        
        natinfo = natinfo_full
        city = city_full
        natinfo$stat = natinfo[,st]
        city$stat = city[,st]
        
        #Make grouped percentiles
        natinfo$pctgrp = ceil(natinfo$p / psize)
        natinfo = natinfo %>% group_by(pctgrp,year) %>% summarise(
          count = sum(cnt), pct = sum(pct), stat = wtd.mean(stat, weights=cnt,na.rm = T)) %>% data.frame()
          
        #Here replace NAs with zeros. Shouldn't matter because weights are zero?
        natinfo$stat = replace(natinfo$stat, is.na(natinfo$stat),0)
        
        #Make national percentile mean relative to national total mean
        natinfo = merge(natinfo, natlmeans, by = 'year')
        natinfo$nat_pctl_rel_tot = natinfo$stat / natinfo$nattotstat
        
        #Make grouped city percentiles - note that we exclude NAs because they appear in region-percentiles with no observations.
        city$pctgrp = ceil(city$p / psize)
        city = city %>% group_by(city,pctgrp,year) %>% summarise(
          count = sum(cnt), pct = sum(pct), stat = wtd.mean(stat, weights=cnt,na.rm =T)) %>% data.frame()
        
        #Add national year and national percentile means
        city = merge(city, natinfo[,c('year','pctgrp','stat','nat_pctl_rel_tot','nattotstat','pct')], by = c('year','pctgrp'), suffixes = c('_city','_nat'), all.x = T)
        
        #Make sure merge worked
        stopifnot(!is.na(city$stat_nat))

        #If a city has no people people in a given percentile bucket, use national level isntead
        #This will drop out in the observed versions, because the population weight will be zero
        #But in the counterfactuals population may be added, so just use national level
        city[is.na(city$stat_city),'stat_city'] = city[is.na(city$stat_city),'stat_nat']
        
        #Make city percentile stat relative to national percentile stat
        city$city_pctl_rel_nat = city$stat_city / city$stat_nat
        city[city$stat_nat == 0 & city$stat_city == 0,'city_pctl_rel_nat'] = 1 #In cases where both are zero set ratio to 1  
        stopifnot(!is.na(city$city_pctl_rel_nat))
        
        #Add 1980 values of city pct stat relative to national pct stat, 
        #national pct stat relative to national overall average stat, 
        #and percent of city population in percentile
        c1980 = filter(city, year == 1980) %>% select(city,pctgrp,nat_pctl_rel_tot,city_pctl_rel_nat,pct_city)
        city = merge(city,c1980, by = c('city','pctgrp'), suffixes = c('_XX','_80'),all.x = T)
        
        #Drop cities without data for 1980
        table(city[is.na(city$city_pctl_rel_nat_80),'city']) 
        #MSAs: 24380 - Grants, NM Micropolitan Statistical Area, which is one county that was formed after 1980. Drop.
        #CZs: 34101 - Sand Point, AK, which also had difficulty assigning earler. Drop
        city = filter(city, !is.na(city_pctl_rel_nat_80))
        
        #Now, do two sets of counterfactual incomes
        cfacts = city %>% group_by(city,year) %>% summarise(pop = sum(count),actual_incstat = wtd.mean(stat_city,weights = count),natl_incstat = wtd.mean(nattotstat,count)) %>% data.frame() #Remember stat_nat is national for that quantile, nattotstat is national overall
        for(gcfact in c('XX','80')){
          for(icfact in c('XX','80')){
            #geography is both pct_city and city_pctle_rel_nat (city_pctle_rel_nat captures differences in distirbution within buckets)
            #Income is just natl_pctl_rel_tot
            city$geovar = city[,paste('pct_city_',gcfact,sep = '')]
            city$incvar = city[,paste('city_pctl_rel_nat_',gcfact,sep = '')] * city[,paste('nat_pctl_rel_tot_',icfact,sep = '')]
            
            if(st == 'mean') cf = city %>% group_by(city,year) %>% summarise(cf = wtd.mean(incvar,weights = geovar*1000)) %>% data.frame()
            names(cf) = c('city','year',paste('cfact_inc_g',gcfact,'_i',icfact,sep = ''))
            cfacts = merge(cfacts, cf, by = c('city','year'),all.x = T )
          }
        }
        cfacts$actual_statrel = cfacts$actual_incstat / cfacts$natl_incstat
        
        #Export counterfactual incomes list
        write.csv(cfacts,paste('output/ipums/',fname,'/counterfactuals/',sname,'/cf_',fname,'_',sname,'_',st,'_',psize,'.csv',sep = ''),row.names = F)           
           
        #Get alternative year populations
        pop80 = filter(cfacts, year == 1980) %>% select(city, pop)
        pop13 = filter(cfacts, year == 2013) %>% select(city, pop)
        cfacts = merge(cfacts, pop80, by = 'city',suffixes = c('','_80'))
        cfacts = merge(cfacts, pop13, by = 'city',suffixes = c('','_13'))
        cfacts$pop_cur = cfacts$pop
        
        #Loop over popyear for sigma graphs
        for(popyr in c('cur','80','13')){
          cfacts$pop = cfacts[,paste('pop_',popyr,sep = '')]
        
          #Make sd relative to stat over time
          cfactsds = cfacts %>% group_by(year) %>% summarise(
            sd_actual = sqrt(wtd.var(actual_statrel, weights = pop)),
            sd_gXX_iXX = sqrt(wtd.var(cfact_inc_gXX_iXX, weights = pop)),
            sd_g80_i80 = sqrt(wtd.var(cfact_inc_g80_i80, weights = pop)),
            sd_g80_iXX = sqrt(wtd.var(cfact_inc_g80_iXX, weights = pop)),
            sd_gXX_i80 = sqrt(wtd.var(cfact_inc_gXX_i80, weights = pop))
          ) %>% data.frame()
            
          #Plot
          pltdta = cfactsds %>% melt(id.vars = 'year')
          sddta = pltdta
          pltdta = filter(pltdta, variable != 'sd_gXX_iXX')
          pltdta$variable = factor(pltdta$variable, levels = paste('sd_',c('actual','g80_i80','g80_iXX','gXX_i80'),sep = ''),labels = c('Observed geography, observed income','1980 geography, 1980 income','1980 geography, observed income','Observed geography, 1980 income'))
          pdf(paste('output/ipums/',fname,'/counterfactuals/',sname,'/cf_sigma_',fname,'_',sname,'_',st,'_sd_',psize,'_pop',popyr,'.pdf',sep = ''),height = 6)
          gplt = ggplot(pltdta, aes(x = year, y = value, lty = variable, group = variable)) +
            geom_line() +
            geom_point() +
            scale_y_continuous(limits = c(0,1.1 * max(pltdta$value)),name = paste('Coefficient of variation')) +
            theme_bw() +
            scale_x_continuous(name = 'Year')+
            guides(lty = guide_legend(nrow = 2,title = NULL)) +
            theme(legend.position = 'bottom') #+ ,plot.title = element_text(hjust = 0.5)
            # ggtitle(paste('Counterfactual variation in coefficient of variation\nof',cityname,tname,stname,'incomes'))
          print(gplt)
          dev.off()
        
          #Make IQR and 10-90 difference relative to mean over time
          cfactpcts = cfacts %>% group_by(year) %>% summarise(
            p10_actual = wtd.quantile(actual_statrel,weights = pop, probs = .1),
            p10_gXX_iXX = wtd.quantile(cfact_inc_gXX_iXX,weights = pop, probs = .1),
            p10_g80_i80 = wtd.quantile(cfact_inc_g80_i80,weights = pop, probs = .1),
            p10_g80_iXX = wtd.quantile(cfact_inc_g80_iXX,weights = pop, probs = .1),
            p10_gXX_i80 = wtd.quantile(cfact_inc_gXX_i80,weights = pop, probs = .1),
            p25_actual = wtd.quantile(actual_statrel,weights = pop, probs = .25),
            p25_gXX_iXX = wtd.quantile(cfact_inc_gXX_iXX,weights = pop, probs = .25),
            p25_g80_i80 = wtd.quantile(cfact_inc_g80_i80,weights = pop, probs = .25),
            p25_g80_iXX = wtd.quantile(cfact_inc_g80_iXX,weights = pop, probs = .25),
            p25_gXX_i80 = wtd.quantile(cfact_inc_gXX_i80,weights = pop, probs = .25),
            p75_actual = wtd.quantile(actual_statrel,weights = pop, probs = .75),
            p75_gXX_iXX = wtd.quantile(cfact_inc_gXX_iXX,weights = pop, probs = .75),
            p75_g80_i80 = wtd.quantile(cfact_inc_g80_i80,weights = pop, probs = .75),
            p75_g80_iXX = wtd.quantile(cfact_inc_g80_iXX,weights = pop, probs = .75),
            p75_gXX_i80 = wtd.quantile(cfact_inc_gXX_i80,weights = pop, probs = .75),
            p90_actual = wtd.quantile(actual_statrel,weights = pop, probs = .9),
            p90_gXX_iXX = wtd.quantile(cfact_inc_gXX_iXX,weights = pop, probs = .9),
            p90_g80_i80 = wtd.quantile(cfact_inc_g80_i80,weights = pop, probs = .9),
            p90_g80_iXX = wtd.quantile(cfact_inc_g80_iXX,weights = pop, probs = .9),
            p90_gXX_i80 = wtd.quantile(cfact_inc_gXX_i80,weights = pop, probs = .9)
          ) %>% data.frame()
        
          for(v in c('actual','gXX_iXX','g80_i80','g80_iXX','gXX_i80')){
            cfactpcts[,paste('iqr_',v,sep = '')] = cfactpcts[,paste('p75_',v,sep = '')] -cfactpcts[,paste('p25_',v,sep = '')]  
            cfactpcts[,paste('p19_',v,sep = '')] = cfactpcts[,paste('p90_',v,sep = '')] -cfactpcts[,paste('p10_',v,sep = '')]  
          }
          
          #Plot IQR
          pltdta = cfactpcts[,c('year',grep('iqr',names(cfactpcts),value = T))]
          pltdta = pltdta %>% melt(id.vars = 'year')
          iqrdta = pltdta
          
          pltdta = filter(pltdta, variable != 'iqr_gXX_iXX')
          pltdta$variable = factor(pltdta$variable, levels = paste('iqr_',c('actual','g80_i80','g80_iXX','gXX_i80'),sep = ''),labels = c('Observed geography, observed income','1980 geography, 1980 income','1980 geography, observed income','Observed geography, 1980 income'))
          
          pdf(paste('output/ipums/',fname,'/counterfactuals/',sname,'/cf_sigma_',fname,'_',sname,'_',st,'_iqr_',psize,'_pop',popyr,'.pdf',sep = ''),height = 6)
          gplt = ggplot(pltdta, aes(x = year, y = value, lty = variable, group = variable)) +
            geom_line() +
            geom_point() +
            scale_y_continuous(limits = c(0,1.1 * max(pltdta$value)),name = paste('IQR of',cityname,tname,stname,'income\nrelative to national',stname)) +
            theme_bw() +
            scale_x_continuous(name = 'Year')+
            guides(lty = guide_legend(nrow = 2,title = NULL)) +
            theme(legend.position = 'bottom') 
            # ggtitle(paste('Counterfactual variation in IQR\nof',cityname,stname,tname,'incomes'))
          print(gplt)
          dev.off()
        
          #Plot 90-10 range
          pltdta = cfactpcts[,c('year',grep('p19',names(cfactpcts),value = T))]
          pltdta = pltdta %>% melt(id.vars = 'year')
          p91dta = pltdta
          
          pltdta = filter(pltdta, variable != 'p19_gXX_iXX')
          pltdta$variable = factor(pltdta$variable, levels = paste('p19_',c('actual','g80_i80','g80_iXX','gXX_i80'),sep = ''),labels = c('Observed geography, observed income','1980 geography, 1980 income','1980 geography, observed income','Observed geography, 1980 income'))
          
          pdf(paste('output/ipums/',fname,'/counterfactuals/',sname,'/cf_sigma_',fname,'_',sname,'_',st,'_p19_',psize,'_pop',popyr,'.pdf',sep = ''),height = 6)
          gplt = ggplot(pltdta, aes(x = year, y = value, lty = variable, group = variable)) +
            geom_line() +
            geom_point() +
            scale_y_continuous(limits = c(0,1.1 * max(pltdta$value)),name = paste('90-10 range of',cityname,tname,stname,'income\nrelative to national',stname)) +
            scale_x_continuous(name = 'Year')+
            guides(lty = guide_legend(nrow = 2,title = NULL)) +
            theme_bw() +
            theme(legend.position = 'bottom') 
            # ggtitle(paste('Counterfactual variation in 90-10 range\nof',cityname,stname,tname,'income'))
          print(gplt)
          dev.off()
        
          #Output data frame of the various statistics 
          stdta = rbind(sddta,iqrdta,p91dta)
          stdta = dcast(stdta, year ~ variable,value.var = 'value')
          stdta$samp = sname
          stdta$stat = st
          stdta$psize = psize
          stdta$popyr = popyr
          
          cfsig = rbind(cfsig, stdta)
        }
          
        
        #Maps and betas don't use popyear
        incnames = c('1980 income at each percentile',paste(2013,'income at each percentile'))
        geonames = c('1980 geographic distribution',paste(2013,'geographic distribution'))
        incyears = c('80','XX')
        
        betas = c(sname,st,psize)
      
        for(inum in 1:length(incyears)){
          for(gnum in 1:length(incyears)){
            
            #Make variables
            inu = incyears[inum]
            gnu = incyears[gnum]
            iname = incnames[inum]
            gname = geonames[gnum]
            
            #Get outcome variable of interest
            cfacts$varint = cfacts[,paste('cfact_inc_g',gnu,'_i',inu,sep = '')]
            
            #Categorize into income buckets
            cfacts$varcat = cut(cfacts$varint,breaks = c(0,.8,.9,1.1,1.2,10000), labels = c('<80%','80-90%','90-110%','110-120%','>120%'))
            
            cfacts13 = filter(cfacts, year == 2013)
            mapdat = merge(cntys,cfacts13,by = 'city', all.x = T)
            
            stopifnot(!is.na(mapdat$varcat) | mapdat$city == '24380') #Grants, NM, which is a 1-county MSA that didn't use to exist
            
            #Map data
            mapdat = mapdat %>% arrange(order)
            
            # maptit = paste('Simulated',cityname,stname,tname,'income\n',iname,'-',gname)#,'\npercentile group size:',psize )
            #Map title for 
            maptit = ''
            if(inu == '80' & gnu == 'XX') maptit = 'A: 2013 sorting, 1980 inequality'
            if(inu == 'XX' & gnu == '80') maptit = 'B: 1980 sorting, 2013 inequality'
            
            
            #Categorical map
            pdf(paste('output/ipums/',fname,'/counterfactuals/',sname,'/cf_map_',fname,'_',sname,'_',st,'_g',gnu,'_i',inu,'_',psize,'.pdf',sep = ''),height = 4.5)
            gplot = ggplot(data = mapdat, aes(long, lat, group = group)) +
              geom_polygon( aes(fill = varcat)) +
              coord_map("polyconic") +
              scale_fill_grey(start = .9,end = .1,name ='Mean family income\nrelative to nation') +
              theme_map() +
              theme(legend.position = 'bottom',legend.justification = 'center') + #plot.title = element_text(hjust = 0.5) 
              ggtitle(maptit)
            print(gplot)
            dev.off()
          
            #Do beta convergence
            cfacts80 = filter(cfacts, year == 1980)
            cfbeta = merge(cfacts13,cfacts80, by = 'city',suffixes = c('','_80'))
            cfbeta$inc13 = cfbeta$natl_incstat * cfbeta$varint
            cfbeta$inc80 = cfbeta$natl_incstat_80 * cfbeta$varint_80
            cfbeta$anngr = (cfbeta$inc13 / cfbeta$inc80) ^ (1/33) - 1
            
            #Fit linear model, using 1980 weights as in original
            mod1 = lm(formula = anngr ~ inc80,weights = pop_80,data = cfbeta)
  
            #For export
            betas = c(betas,mod1$coefficients[2])
            
            #Title
            betatit = paste('Simulated beta convergence\nin ',cityname,' ',stname,' ',tname,' income, 1980-2013\n',iname,' - ',gname,sep = '')
            
            #Plot annual growth rate vs
            pdf(paste('output/ipums/',fname,'/counterfactuals/',sname,'/cf_beta_',fname,'_',sname,'_',st,'_g',gnu,'_i',inu,'_',psize,'.pdf',sep = ''),height = 6)
            gplt = ggplot(cfbeta, aes(x = inc80, y = anngr,size = pop_80)) +
              geom_point(alpha = .3) +
              theme_bw() +
              scale_x_continuous(labels = dollar, name = paste('1980',cityname,stname,tname,'income')) +
              scale_y_continuous(labels = percent, name = paste('Annualized growth rate in',stname,'income, 1980-2013')) +
              geom_abline(intercept = mod1$coefficients[1], slope = mod1$coefficients[2],lty = 2) +
              scale_size_continuous(range = c(0,5), labels = comma, name = paste('1980',cityname,'\npopulation'))  +
              # ggtitle(betatit)
              theme(legend.position = 'bottom')
            
            print(gplt)
            dev.off()
            
            
        } #inum
      } #gnum
        cfbetas = rbind(cfbetas,betas)
    } #psize
  } #samp
  cfbetas = data.frame(cfbetas)
  names(cfbetas) = c('samp','stat','psize','i80g80','i80gXX','iXXg80','iXXgXX')
  write.csv(cfbetas,paste('output/ipums/',fname,'/counterfactuals/cf_beta_',fname,'.csv',sep = ''),row.names = F)
  
  # cfsig = data.frame(cfsig)
  write.csv(cfsig, paste('output/ipums/',fname,'/counterfactuals/cf_sig_',fname,'.csv',sep = ''),row.names = F)
  
} #var
