#Take BEA county per capita income data and create MSA and CZ averages

library(dplyr)
library(readstata13)
library(stringr)
library(reshape2)

#Work from the project root; every path below is relative to it
setwd('~/projects/regionineq/')
#recursive = TRUE creates any missing parent folders (output/, output/bea/) so
#the exports further down cannot fail on a fresh checkout; showWarnings = FALSE
#keeps re-runs quiet when the folder already exists
dir.create('output/bea/dta', recursive = TRUE, showWarnings = FALSE)

#Load the price index table and pull out the 2015 value, which is the base
#that incomes are rescaled to further down
cpirs <- read.dta13('data/reference/cpirs.dta')
is_2015 <- cpirs$year == 2015
cpi15 <- cpirs[is_2015, 'rate']

#Load the county -> MSA lookup; fips is left-padded with zeros to 5 characters
#and only the three columns used downstream are kept
msas <- read.csv('output/mable/county_msa_list.csv')
msas$fips <- str_pad(msas$fips, width = 5, pad = '0')
msas <- msas %>% select(fips, msa, msaname)

#Counties added since 1990 that the MSA list does not cover.
#Each entry is (fips, msa code, msa name) and is appended as one new row.
extra_msa_rows <- list(
  #Maui and Kalawao County HI -> Kahului
  c('15901', '27980', 'Kahului-Wailuku-Lahaina, HI'),
  #fips 46102 is coded here as South Dakota non-Metro
  c('46102', '46', 'South Dakota non-Metro'),
  #Shawano + Menominee County -> Shawano WI
  c('55901', '43020', 'Shawano, WI'),
  #Three Alaska counties that aren't current to the MSAs list are all
  #non-metro according to https://en.wikipedia.org/wiki/Alaska_statistical_areas
  c('02201', '2', 'Alaska non-Metro'),
  c('02231', '2', 'Alaska non-Metro'),
  c('02280', '2', 'Alaska non-Metro')
)
for (extra_row in extra_msa_rows) {
  msas[nrow(msas) + 1, ] <- extra_row
}

#Load the county -> commuting zone lookup and reduce it to two zero-padded,
#5-character keys: county fips and 1990 commuting zone code
cz_raw <- read.csv('data/reference/czlma903.csv')
czs <- data.frame(
  fips = str_pad(cz_raw$County.FIPS.Code, width = 5, pad = '0'),
  czone = str_pad(cz_raw$CZ90, width = 5, pad = '0'),
  stringsAsFactors = FALSE
)

#Counties added since 1990 that the CZ lookup does not cover.
#Each entry is (fips, czone) and is appended as one new row.
extra_cz_rows <- list(
  c('08014', '28900'), #Broomfield County CO -> Denver CZ
  c('12086', '07000'), #Miami-Dade County FL -> Miami CZ
  c('15901', '34703'), #Maui and Kalawao County HI -> Kahului
  c('46102', '27704'), #fips 46102 (SD) -> Gordon SD
  c('55901', '22602')  #Shawano + Menominee County -> Shawano WI
)
for (extra_row in extra_cz_rows) {
  czs[nrow(czs) + 1, ] <- extra_row
}

#Attach commuting zone names (Chetty et al 2014 online data tables).
#The display name is built as "<CZ name>, <State>".
cz_name_raw <- read.csv('data/reference/cznames_chetty2014.csv')
cznames <- data.frame(
  czone = str_pad(cz_name_raw$CZ, width = 5, pad = '0'),
  czname = paste(cz_name_raw$CZ.Name, cz_name_raw$State, sep = ', '),
  stringsAsFactors = FALSE
)
#Full outer merge, then insist that every resulting row has a name
czs <- merge(czs, cznames, by = 'czone', all = TRUE)
stopifnot(!is.na(czs$czname))

#Load BEA's Virginia crosswalk. trufips is the state code followed by the
#3-digit zero-padded county code; vacombine is the code it is matched against
#the BEA data on further down.
vaxw <- read.csv('data/bea/st51_va_cou_xwalk.csv')
vaxw$st <- as.character(vaxw$st)
vaxw$cnty <- str_pad(as.character(vaxw$county), width = 3, pad = '0')
vaxw$trufips <- paste0(vaxw$st, vaxw$cnty)
vaxw$vacombine <- as.character(vaxw$vacombine)
vaxw <- filter(vaxw, !is.na(vacombine))

#Keep only the first row per vacombine for the data analysis
#(the full crosswalk stays in vaxw and may be wanted later for mapping)
first_per_combine <- !duplicated(vaxw$vacombine)
vamerge <- vaxw[first_per_combine, ]

#Load the Alaska lookup; both codes are zero-padded to 5 characters and rows
#without a GeoFIPS-based code are dropped
pad5 <- function(code) str_pad(code, width = 5, pad = '0')
akxw <- read.csv('data/bea/st02_alaska_lookup.csv')
akxw$trufips <- pad5(akxw$Czfips)
akxw$akcombine <- pad5(akxw$GeoFIPS)
akxw <- filter(akxw, !is.na(akcombine))

#Bring in main BEA data (every column is read as character)
dta = read.csv('data/bea/CA1/CA1_1969_2015__ALL_AREAS.csv',colClasses = 'character')
dta = dta[,setdiff(names(dta),c('Region','Table','IndustryClassification','Description'))] #Drop variables we don't need
#Rows whose GeoFIPS is not a number are the notes at the bottom of the file
dta = dta[!is.na(as.numeric(as.character(dta$GeoFIPS))),] #Drop notes from bottom
stopifnot(nrow(dta) == 9594)
dta$fips = str_pad(as.character(dta$GeoFIPS), width = 5,pad = '0')
dta$st = substr(dta$fips,1,2) #First two characters of fips
dta$cnty = substr(dta$fips, 3,5) #Last three characters of fips

#Drop regions
dta = dta[!(dta$st %in% c('91','92','93','94','95','96','97','98')),] 
stopifnot(nrow(dta) == 9570)

#Convert the yearly value columns to numeric. They are found by name (read.csv
#names them X<year>) rather than by position, so a shifted column layout cannot
#silently convert the wrong columns. lapply keeps this column-by-column instead
#of routing the data frame through a matrix as apply() does.
yearcols = grep('^X[0-9]{4}$', names(dta), value = TRUE)
stopifnot(length(yearcols) > 0)
dta[yearcols] = lapply(dta[yearcols], as.numeric)

#Reshape to long form: one row per area / LineCode / year column.
#The year is recovered by stripping the 'X' from the melted column name.
id_columns <- c('GeoFIPS','GeoName','LineCode','fips','st','cnty')
dtalong <- melt(dta, id.vars = id_columns)
dtalong$year <- as.numeric(str_replace(as.character(dtalong$variable), 'X', ''))

#Back to wide form: one row per area-year with one column per LineCode.
#The three LineCode columns are renamed by position to gdp, pop and incpc
#(presumably total income, population and income per capita, in that order;
#confirm against the CA1 line codes).
area_year_values <- dtalong %>%
  select(year, GeoName, fips, st, cnty, value, LineCode)
dtawide <- dcast(area_year_values,
                 year + GeoName + fips + st + cnty ~ LineCode,
                 value.var = 'value')
names(dtawide) <- c('year', 'geoname','fips','st','cnty','gdp','pop','incpc')

#Attach each year's price index and rescale both money columns to the 2015 level
dtawide <- merge(dtawide, cpirs, by = 'year')
dtawide <- mutate(dtawide,
                  gdptot = gdp * cpi15 / rate,
                  gdppc = incpc * cpi15 / rate)

#National series by year: rows with state code '00', with the value columns
#given a 'nat' suffix so they can sit beside regional columns after a merge
npc <- dtawide %>%
  filter(st == '00') %>%
  select(year, gdptotnat = gdptot, popnat = pop, gdppcnat = gdppc)

#Write the national series to disk
write.csv(npc, 'output/bea/dta/nat_gdppc.csv', row.names = FALSE)

#County-level rows only: drop rows with county code '000' or state code '00'
cpc <- dtawide %>% filter(cnty != '000', st != '00')

#Swap a combined-area code for the crosswalk's trufips wherever the crosswalk
#has a match on `combined_col`; unmatched rows keep their fips. The helper
#column trufips is removed again before returning.
replace_combined_fips <- function(counties, xwalk, combined_col) {
  lookup <- xwalk[, c(combined_col, 'trufips')]
  counties <- merge(counties, lookup, all.x = TRUE,
                    by.x = 'fips', by.y = combined_col)
  matched <- !is.na(counties$trufips)
  counties[matched, 'fips'] <- counties[matched, 'trufips']
  counties[, setdiff(names(counties), 'trufips')]
}

#Virginia first, then Alaska
cpc <- replace_combined_fips(cpc, vamerge, 'vacombine')
cpc <- replace_combined_fips(cpc, akxw, 'akcombine')

#Add in msas
#The lookup has to be one row per county: a fips that appears twice in msas
#would fan out this merge and count that county twice in every total below.
#The row counts are still shown, and are now also enforced.
n_before_msa = nrow(cpc)
n_before_msa
cpc = merge(cpc, msas, by = 'fips', all.x = TRUE)
nrow(cpc)
stopifnot(nrow(cpc) == n_before_msa)
#Every county must have found an MSA
stopifnot(!is.na(cpc$msaname))

#Add in CZs
#The lookup has to be one row per county: a fips that appears twice in czs
#would fan out this merge and count that county twice in every total below.
#The row counts are still shown, and are now also enforced.
n_before_cz = nrow(cpc)
n_before_cz
cpc = merge(cpc, czs, by = 'fips', all.x = TRUE)
nrow(cpc)
stopifnot(nrow(cpc) == n_before_cz)
#Every county must have found a commuting zone name
stopifnot(!is.na(cpc$czname))

#MSA dataset: sum population and total income over the counties of each MSA
#and year, then derive the per capita figure from those sums so it is computed
#for the whole MSA at once rather than averaged across counties.
#(The factor of 1000 presumably reflects totals reported in thousands; confirm.)
cdta <- cpc %>%
  group_by(msa, msaname, year) %>%
  summarise(pop = sum(pop, na.rm = TRUE),
            gdp = sum(gdptot, na.rm = TRUE)) %>%
  data.frame() %>%
  mutate(gdppc = gdp * 1000 / pop)

#Sanity check: the MSA totals should add back up to the national series.
#The two gaps are shown and then enforced.
czn <- data.frame(summarise(group_by(cdta, year),
                            gdpnat = sum(gdp, na.rm = TRUE),
                            pop = sum(pop, na.rm = TRUE)))
ncheck <- merge(npc, czn, by = 'year')
sum(abs(ncheck$gdptotnat - ncheck$gdpnat)) #previously observed: 7.629395e-06
sum(abs(ncheck$popnat - ncheck$pop)) #previously observed: 0
stopifnot(sum(abs(ncheck$gdptotnat - ncheck$gdpnat)) < .0005)
stopifnot(sum(abs(ncheck$popnat - ncheck$pop)) < .0005)

#Per capita value of each MSA as a share of the national value for that year,
#binned into five bands (cut's default makes each band include its upper edge)
rel_breaks <- c(0,.8,.9,1.1,1.2,100000)
rel_labels <- c("<80%", "80-90%", "90-110%", "110-120%", ">120%")
cdta <- merge(cdta, npc, by = 'year', all.x = TRUE)
cdta <- mutate(cdta,
               gdprel = gdppc / gdppcnat,
               inccat = cut(gdprel, breaks = rel_breaks, labels = rel_labels))

#Write the MSA table to disk
write.csv(cdta, 'output/bea/dta/msa_gdppc.csv', row.names = FALSE)

#Commuting zone dataset: same construction as the MSA table, grouped by
#commuting zone. Totals are summed first and the per capita figure is derived
#from the sums. (The factor of 1000 presumably reflects totals reported in
#thousands; confirm.)
cz_groups <- group_by(cpc, czone, czname, year)
cdta <- data.frame(summarise(cz_groups,
                             pop = sum(pop, na.rm = TRUE),
                             gdp = sum(gdptot, na.rm = TRUE)))
cdta$gdppc <- cdta$gdp * 1000 / cdta$pop

#Sanity check: the zone totals should add back up to the national series.
#The two gaps are shown and then enforced.
czn <- cdta %>%
  group_by(year) %>%
  summarise(gdpnat = sum(gdp, na.rm = TRUE),
            pop = sum(pop, na.rm = TRUE)) %>%
  data.frame()
ncheck <- merge(npc, czn, by = 'year')
sum(abs(ncheck$gdptotnat - ncheck$gdpnat)) #previously observed: 5.722046e-06
sum(abs(ncheck$popnat - ncheck$pop)) #previously observed: 0
stopifnot(sum(abs(ncheck$gdptotnat - ncheck$gdpnat)) < .0005)
stopifnot(sum(abs(ncheck$popnat - ncheck$pop)) < .0005)

#Per capita value of each zone as a share of the national value for that year
cdta <- merge(cdta, npc, by = 'year', all.x = TRUE)
cdta$gdprel <- cdta$gdppc / cdta$gdppcnat

#Bin the share into five bands (cut's default makes each band include its upper edge)
band_breaks <- c(0,.8,.9,1.1,1.2,100000)
band_labels <- c("<80%", "80-90%", "90-110%", "110-120%", ">120%")
cdta$inccat <- cut(cdta$gdprel, breaks = band_breaks, labels = band_labels)

#Write the commuting zone table to disk
write.csv(cdta, 'output/bea/dta/cz_gdppc.csv', row.names = FALSE)
