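#Compute the Zhou stratification index (the full-sample run takes too long, so it is estimated from subsamples)
#Possibly also top 1% concentration? (not computed in the loop below)
#Also compute the percent of income variation that is across cities (MSAs / commuting zones)
#All of the above are computed separately for each sample year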

library(tidyverse)
library(strat)
library(Hmisc)
library(readstata13)

# Every relative path used below (data/..., output/...) is resolved against
# this project root.
setwd("~/projects/regionineq/")

# CPI lookup, currently disabled (incomes are not inflation-adjusted below):
# cpirs = read.dta13('data/reference/cpirs.dta')
# cpi15 = cpirs[cpirs$year == 2015,'rate']

# Parallel lookup vectors, one entry per geography the main loop iterates over
# (position 1 = MSA, position 2 = commuting zone).
varnames   <- c("msa", "czone")           # column holding the city identifier
filenames  <- c("msa", "cz")              # tag used in input/output file paths
cleannames <- c("msaname", "czname")      # column holding the city name
citynames  <- c("MSA", "Commuting Zone")  # human-readable label

# Main loop. For each geography (MSA, then commuting zone), each sample year,
# and each income definition, compute:
#   - the Zhou stratification index of income across cities, averaged over
#     random subsamples because the full-sample run is too slow, and
#   - the share of total weighted income variance that is across cities.
# Results are re-exported to CSV after every sample so partial runs are kept.

# Column names of the exported result tables
resnames <- c("city", "samp", "year", "crosspct", "totvar", "zstratsm",
              "zrange")

# Subsampling scheme for the stratification index
nsubsamp <- 10
subsampsize <- 100000

for (i in seq_along(varnames)) {
  vname <- varnames[i]
  fname <- filenames[i]
  cname <- cleannames[i]
  cityname <- citynames[i]

  # All CSVs for this geography are written here
  outdir <- paste0("output/ipums/", fname, "/sorting/")

  # Accumulates one row per (year, sample) for this geography
  natinfo <- NULL

  # Loop through years (two-digit tags used in the input file names)
  for (yr in c("80", "90", "00", "08", "13")) {
    fullyr <- as.numeric(yr) + 2000
    if (yr %in% c("80", "90")) fullyr <- fullyr - 100
    print(paste(yr, vname))
    # cpiyr = cpirs %>% filter(year == fullyr - 1) %>% select(rate) %>% unlist()

    # Accumulates one row per sample for this year only
    yrinfo <- NULL

    # Load into a scratch environment rather than the global one, so the
    # previous year's dataset is not kept alive while the next one is read.
    loadenv <- new.env()
    dtfile <- load(paste0("data/ipums/ip", yr, fname, ".RData"),
                   envir = loadenv)
    if (length(dtfile) > 1) {
      warning("More than one object in the ", yr, " ", fname,
              " file; using the first (", dtfile[1], ")", call. = FALSE)
    }
    dta <- get(dtfile[1], envir = loadenv)
    rm(loadenv)

    # Make city identifier ([[ ]] returns a plain vector for both data.frames
    # and tibbles)
    dta$city <- dta[[vname]]
    dta$cityname <- as.character(dta[[cname]])

    # Make normalized household incomes: household size is the number of
    # person records sharing year, serial and city
    dta <- dta %>%
      group_by(year, serial, city) %>%
      mutate(hhsize = n()) %>%
      data.frame()
    dta$hhincsqrt <- dta$hhincome / sqrt(dta$hhsize)
    dta$hhincnorm <- dta$hhincome / dta$hhsize

    # Same for families: family income is the sum of members' inctot
    dta <- dta %>%
      group_by(year, serial, famunit, city) %>%
      mutate(fsize = n(), finc = sum(inctot, na.rm = TRUE)) %>%
      data.frame()
    dta$fincsqrt <- dta$finc / sqrt(dta$fsize)
    dta$fincnorm <- dta$finc / dta$fsize

    # Inflate
    # dta$hhincome = dta$hhincome * cpi15 / cpiyr
    # dta$inctot = dta$inctot * cpi15 / cpiyr
    # dta$ftotinc = dta$ftotinc * cpi15 / cpiyr

    # All samples are person-weighted
    # Samples are household, family, individual, individual male, individual female income
    # By working age (18-64) and prime age (25-54) categories, plus everyone for household and family income

    # Samples: the family/household samples use every person record and
    # differ only in the income column picked below; the individual samples
    # are restricted to adults
    adult <- dta %>% filter(age >= 18)
    samps <- list(
      fam = dta, famsqrt = dta, famnorm = dta,
      hh = dta, hhsqrt = dta, hhnorm = dta,
      adult = adult,
      adultmale = adult %>% filter(sex == "Male"),
      adultfemale = adult %>% filter(sex == "Female")
    )
    snames <- c("fam", "famsqrt", "famnorm", "hh", "hhsqrt", "hhnorm",
                "adult", "adultmale", "adultfemale")
    tnames <- c("family", "sqrt-normalized family", "normalized family",
                "household", "sqrt-normalized household",
                "normalized household", "adult individual", "adult male",
                "adult female")

    # Loop through samples
    for (s in seq_along(samps)) {

      samp <- samps[[s]]
      sname <- snames[s]
      titname <- tnames[s]
      print(sname)

      # Create the per-sample directory (and any missing parents); stay quiet
      # when it already exists from an earlier year
      dir.create(paste0(outdir, sname), recursive = TRUE,
                 showWarnings = FALSE)

      # Get weights - weighting by person in all cases
      samp$wt <- samp$pcwt

      # Get income
      samp$inc <- switch(sname,
        fam = samp$ftotinc,
        famsqrt = samp$fincsqrt,
        famnorm = samp$fincnorm,
        hh = samp$hhincome,
        hhsqrt = samp$hhincsqrt,
        hhnorm = samp$hhincnorm,
        adult = ,
        adultmale = ,
        adultfemale = samp$inctot
      )

      # Drop missing incomes
      samp <- filter(samp, !is.na(inc))

      # Set negative incomes to zero (the alternative would be dropping them)
      samp$inc[samp$inc < 0] <- 0
      # samp = filter(samp, inc >= 0)

      # Zhou stratification index - this ran overnight with no visible progress
      # After discussing with Zhou, going to do 10 samples of 100,000 rows and take average. Test run varied from .1386 to .1433 for 2000
      # zstrat = strat(outcome = samp$inc, weights = samp$wt, strata = samp$city)
      # Cap the subsample size at the number of rows available, otherwise
      # sampling without replacement errors on samples under 100,000 rows.
      nsub <- min(nrow(samp), subsampsize)
      zoutput <- numeric(nsubsamp)
      for (sampnum in seq_len(nsubsamp)) {
        samprows <- sample.int(nrow(samp), nsub)
        smallsample <- samp[samprows, ]
        zstratsm <- strat(outcome = smallsample$inc,
                          weights = smallsample$wt,
                          strata = smallsample$city)
        zoutput[sampnum] <- zstratsm$overall[1]
      }
      # zstratbig = strat(outcome = samp$inc, weights = samp$wt, strata = samp$city)
      zsm <- mean(zoutput)
      # Spread of the subsample estimates relative to the smallest one
      zrange <- (max(zoutput) - min(zoutput)) / min(zoutput)

      # Percentage of total variation in incomes across cities
      totvar <- wtd.var(samp$inc, weights = samp$wt)

      # Residual SS - FE model crashes
      # femod = lm(inc ~ cityname, data = samp, weights = wt)

      # Do by hand: weighted city means and city weight totals.
      # dplyr:: is spelled out because Hmisc is attached after tidyverse.
      mmeans <- samp %>%
        group_by(city) %>%
        dplyr::summarise(citymean = wtd.mean(inc, weights = wt),
                         citypop = sum(wt)) %>%
        data.frame()

      # Attach each person's city mean by lookup; only citymean is needed
      # downstream, so a full merge of the microdata is avoided
      samp$citymean <- mmeans$citymean[match(samp$city, mmeans$city)]
      samp$resid <- samp$inc - samp$citymean
      withincity <- wtd.var(samp$resid, weights = samp$wt)

      # Get variance of city means, weighting each city by its total weight
      crosscity <- wtd.var(mmeans$citymean, weights = mmeans$citypop)
      crosspct <- crosscity / totvar

      # within + cross city sum to total variance
      stopifnot(abs(withincity + crosscity - totvar) < 1)

      # Output percentage of total variation that is cross-city.
      # c() coerces the whole row to character, so the exported columns are
      # all strings; kept as-is so the CSV contents do not change.
      resrow <- c(fname, sname, fullyr, crosspct, totvar, zsm, zrange)

      # Export every time: cumulative file for this geography ...
      natinfo <- rbind(natinfo, resrow)
      natdf <- data.frame(natinfo)
      names(natdf) <- resnames
      write.csv(natdf,
                paste0(outdir, "ipums_crossvar_", fname, ".csv"),
                row.names = FALSE)

      # ... and a per-year file
      yrinfo <- rbind(yrinfo, resrow)
      yrdf <- data.frame(yrinfo)
      names(yrdf) <- resnames
      write.csv(yrdf,
                paste0(outdir, "ipums_", yr, "crossvar_", fname, ".csv"),
                row.names = FALSE)

    } # samp
  } # year

} # var
      