#Compute rank order information segregation index
#Separate file to do percent of total variation that's across cities and Zhou's index

library(dplyr)
library(Hmisc)
library(maps)
library(stringr)
library(RColorBrewer)
library(scales)
library(readstata13)
library(ggplot2)

setwd('~/projects/regionineq/')


#Make entropy function - it approaches zero as the proportion in one group gets high, so set it at 0 if the proportion is 1
entropy = function(pctless) {
  if(pctless !=0) outpt = pctless * log(1/pctless) / log(2) + (1-pctless) * log(1/(1-pctless) / log(2))
  if(pctless == 0 | pctless == 1) outpt = 0
  return(outpt)
}
#I think I have to fix how this handles cities with everyone below a given threshold, which happens for p99 sometimes
#It looks like the entropy function approaches zero as everybody gets into one of the categories

#Going to loop over MSAs vs CZs
varnames = c('msa','czone')
filenames = c('msa','cz')
cleannames = c('msaname','czname')
citynames = c('CBSA','Commuting Zone')

for(i in 1:length(varnames)){  #2){ # 
  vname = varnames[i]
  fname = filenames[i]
  cname = cleannames[i]
  cityname = citynames[i]
  
  #Make big directory
  dir.create(paste('output/ipums/',fname,'/sorting',sep = ''))
  
  #Make dataframe to store national information from each sample
  natinfo =NULL
  
  # cntys$city = cntys[,vname]
  
  # natldta = read.csv(paste('output/ipums/',fname,'/dta/',fname,'_natinfo.csv',sep = ''))
  # natldta$year = natldta$fullyr
  # natldta$med = natldta$median
  
  snames = c('fam','famsqrt','famnorm','hh','hhsqrt','hhnorm','adult','adultmale','adultfemale')
  tnames = c('family','sqrt-normalized family','normalized family','household','sqrt-normalized household','normalized household','adult individual','adult male','adult female')
  
  for(s in 1:length(snames)) {
    sname = snames[s]
    tname = tnames[s]

    dir.create(paste('output/ipums/',fname,'/sorting/',sname,sep = ''))
    print(paste(fname,sname))
    
    #Output dataframes
    
    for(yr in c('1980','1990','2000','2008','2013')) {
      print(yr)
      
      dta =  read.csv(paste('output/ipums/',fname,'/dta/',sname,'/',fname,'_pct_city_',sname,'_',yr,'.csv',sep = ''))
      dta$city = dta[,fname]
      dta$cityname = dta[,cname]
      dta$p = as.numeric(dta$p)
      # dta = dta %>% group_by(city) %>% mutate(totpop = sum(cnt)) %>% data.frame()
      
      #Total population
      natpop = sum(dta$cnt)
      citypop = dta %>% group_by(city) %>% summarise(pop = sum(cnt))
      
      # yrn = as.numeric(gsub('a|b','',yr))
      # if(yrn >= 50) yrn = 1900 + yrn
      # if(yrn <50) yrn = 2000 + yrn
      
      # dta$year = yrn
      # allyrs = rbind(allyrs, dta)
      
      #Compute  entropy
      natentstats = NULL
      for(pc in 1:99){ #Don't include 100 because everyone's below it so weight is 0 and entropy is NaN
        
        #print(pc)
        
        #Get national pop lp and entropy
        natlp = filter(dta, p <= pc) %>% summarise(poplp = sum(cnt)) %>% select(poplp) %>% unlist()
        natpctlp = natlp / natpop
        natent = entropy(natpctlp)
        
        #Get city percent of population < p 
        citylp = filter(dta, p <= pc) %>% group_by(city) %>% summarise(poplp = sum(cnt)) %>% data.frame()
        citylp = merge(citylp, citypop, by = 'city')
        citylp$pctlp = citylp$poplp / citylp$pop
        
        #City entropy
        citylp$ent = sapply(citylp$pctlp, FUN = entropy)
        
        #City entropy weighted by population to sum, see Reardon and Bischoff 2011 eq 2 (this is only the part within the sum)
        citylp$entcont = (citylp$pop * citylp$ent) 
        
        #H(p) - see Reardon and Bischoff 2011 eq 2
        hp = 1 - sum(citylp$entcont) / (natent*natpop)
        natentstats = rbind(natentstats, c(pc,natpctlp,natent,hp))
        
      }
      
      #Make data frame of the compiled percentile stats
      natentdf = data.frame(natentstats)
      names(natentdf) = c('pctile','pctless','natent','hp')
      natentdf$hp = as.numeric(as.character(natentdf$hp))
      natentdf$natent = as.numeric(as.character(natentdf$natent))
      natentdf$pctless = as.numeric(as.character(natentdf$pctless))
      natentdf$pctile = as.numeric(as.character(natentdf$pctile))

      #Compute overall segregation measure - See R&B 2011 eq 3
      natseg = 2 * log(2) * sum(natentdf$hp * natentdf$natent) / sum(natentdf$natent)
      stopifnot(abs(natseg - 2 * log(2) * wtd.mean(natentdf$hp, natentdf$natent)) < .000001) #Confirm sum is the same as weighted mean
      
      #Export percentiles and add overall segregation to total 
      write.csv(natentdf, paste('output/ipums/',fname,'/sorting/',sname,'/natent_',sname,'_',yr,'.csv',sep = ''),row.names = F)
      natinfo = rbind(natinfo,c(fname,sname,yr,natseg))
      
      #Make graph of segregation of each percentile
      pdf(paste('output/ipums/',fname,'/sorting/',sname,'/ipums_seg_pctile_',yr,'_',sname,'.pdf',sep = ''), height = 5)
      gplt = ggplot(natentdf, aes(x = pctile, y = hp)) +
        geom_line() +
        theme_bw() +
        scale_y_continuous(limits = c(0,1.1*max(natentdf$hp)),name = 'Information theory segregation of those above vs. below percentile') +
        scale_x_continuous(name = 'Percentile of national income')+
        ggtitle(paste(cityname,tname,'income segregation\nby percentile,',yr))
      print(gplt)
      dev.off()
      
      
    } #Year
  }# Sample
  natinfodf = data.frame(natinfo)
  names(natinfodf) = c('city','sample','year','hindex')
  write.csv(natinfodf, paste('output/ipums/',fname,'/sorting/hindex_',fname,'.csv',sep = ''),row.names = F)
}#variable
    