# Create crosswalk from Census blocks to legally incorporated jurisdictions

library(tidyverse)


# Make directory for export.
# recursive = TRUE also creates the parent 'intermediate_data' folder when it is
# missing; showWarnings = FALSE keeps re-runs quiet when the folder already
# exists. Without these, a missing parent only produces a warning here and the
# script does not fail until the final save() call, after all the slow steps.
dir.create('intermediate_data/fiscal_units', recursive = TRUE, showWarnings = FALSE)


### Prologue: Load the block-level table first, because it is the slowest read

# 2020 Census block extract from NHGIS (large file; expect a long wait)
dblock <- read.csv(
  file = 'nonproprietary_data/nhgis/nhgis0054_csv/nhgis0054_ds248_2020_block.csv'
)


### Part 1: Bring in data on different types of fiscal units ###
## Definition 1: smallest independent fiscal units ##
# Basically, township + town + municipality + county

# Start with place-level population data (county subdivisions follow below).
# The file includes the FUNCSTAT code that says whether a unit is a functional
# government.
dplaces <- read.csv('nonproprietary_data/nhgis/nhgis0054_csv/nhgis0054_ds248_2020_place.csv')

# Tabulate the FUNCSTAT x LSAD combinations that occur in the place file
count(dplaces, FUNCSTAT, LSADC)

# FUNCSTAT B: Baton Rouge and Lafayette cities. These are consolidated with
# their counties, so want to use county data instead
filter(dplaces, FUNCSTAT == "B")

# Zero-padded state and place codes, used as merge keys later on
dplaces <- dplaces %>%
  mutate(
    st = str_pad(STATEA, width = 2, pad = '0'),
    place = str_pad(PLACEA, width = 5, pad = '0')
  )

# Keep everything that has FUNCSTAT A = active general purpose local government.
# Retain the geography keys plus name, LSAD, land area and population so they
# can be compared against block-level sums later. (GISJOIN, GEOID not kept.)
cplaces <- dplaces %>%
  filter(FUNCSTAT == 'A') %>%
  select(
    st,
    place,
    place_name = NAME,
    place_LSAD = LSADC,
    place_AREALAND = AREALAND,
    place_pop = U7B001
  )

dim(cplaces) #19481

# Now get County subdivisions (CSDs)
dcsd <- read.csv('nonproprietary_data/nhgis/nhgis0054_csv/nhgis0054_ds248_2020_cty_sub.csv')

# Zero-padded state / county / subdivision codes for merging, plus a
# two-character version of the LSAD code
dcsd <- dcsd %>%
  mutate(
    st = str_pad(STATEA, width = 2, pad = '0'),
    cnty = str_pad(COUNTYA, width = 3, pad = '0'),
    csd = str_pad(COUSUBA, width = 5, pad = '0'),
    csd_LSAD = str_pad(LSADC, width = 2, pad = '0')
  )

# Check how the different LSAD + FUNCSTAT combos look
count(dcsd, LSADC, FUNCSTAT)

# 0 F - 132
# Lots of counties with "no subdivisions defined", including many in MI
dcsd %>%
  filter(LSADC == '0') %>%
  select(STATE, COUNTY, COUSUB, U7B001)
# But there are still 15 subdivisions in Grand Traverse County - this matches
# Wikipedia (https://en.wikipedia.org/wiki/Grand_Traverse_County,_Michigan#Communities)
dcsd %>%
  filter(STATE == 'Michigan', COUNTY == 'Grand Traverse County') %>%
  count()
# Same county again, this time listing the subdivisions themselves
dcsd %>%
  filter(STATE == 'Michigan', COUNTY == 'Grand Traverse County') %>%
  select(STATE, COUNTY, COUSUB, U7B001)
# Almost all of these are cases where there is no residual population in the
# subdivision. Exceptions are Princeton NJ (which is captured in the places
# file), Cleveland NC, and Arlington VA

# 43 A - towns - what we want
# New England, NY, NJ, Wisconsin. Expected and what we want
dcsd %>%
  filter(LSADC == '43', FUNCSTAT == 'A') %>%
  count(STATE)

# 44 - G - townships in Iowa that don't have powers
# Per web research, IA townships can't raise taxes
dcsd %>%
  filter(LSADC == '44', FUNCSTAT == 'G') %>%
  count(STATE)

# Keep everything that's FUNCSTAT A except LSAD 39 (Plantations in Maine, which
# have very limited powers). (GISJOIN, GEOID not kept.)
ccsd <- dcsd %>%
  filter(FUNCSTAT == 'A', LSADC != '39') %>%
  select(
    st,
    cnty,
    csd,
    csd_name = NAME,
    csd_LSAD,
    csd_AREALAND = AREALAND,
    csd_pop = U7B001
  )

dim(ccsd) #16190


## Definition 2: As above but only CSDs that have meaningful powers - no
## townships except in NJ and PA, no towns in WI (which are basically townships)
# Still including charter townships in MI, which are more like cities

# Where do townships (LSAD 44 / 45) occur? In NJ townships have powers, so
# include them
dcsd %>%
  filter(csd_LSAD %in% c(44, 45)) %>%
  count(STATE, STATEA, csd_LSAD)

# All Wisconsin CSDs with FUNCSTAT = A are towns, which we want in loc but not loc2
dcsd %>%
  filter(st == '55') %>%
  count(STATE, csd_LSAD, FUNCSTAT)

# From loc2 we drop all townships except in NJ (34) and PA (42); charter
# townships in MI are LSAD 49 and so are kept. We also drop towns in WI
# (= all CSDs with FUNCSTAT = A in WI).
# csd_LSAD is character; %in% coerces the numeric codes for the comparison.
bcsd <- ccsd %>%
  filter(
    st != '55',
    st == '34' | st == '42' | !(csd_LSAD %in% c(44, 45))
  )
dim(bcsd) #4194


## Definition 3: School districts ##
# Following the "best" school district from the Census geocorr engine https://mcdc.missouri.edu/applications/docs/maggot2014.html#scldists
# Unified first, then elementary, then secondary

# Unified school districts
usd <- read.csv('nonproprietary_data/nhgis/nhgis0056_csv/nhgis0056_ds248_2020_sd_uni.csv')
usd <- usd %>%
  mutate(
    st = str_pad(STATEA, width = 2, pad = '0'),
    cnty = str_pad(COUNTYA, width = 3, pad = '0'),
    usd = str_pad(SDUNIA, width = 5, pad = '0')
  )

# Almost all E, 47 F which are fictitious for statistical purposes
# ("Remainder of Arizona" etc.)
table(usd$FUNCSTAT, useNA = 'a')
# Keep only FUNCSTAT = E, active government with special functions
usd <- usd %>%
  filter(FUNCSTAT == 'E') %>%
  select(st, usd, usd_name = SDUNI, usd_AREALAND = AREALAND, usd_pop = U7B001)

# Elementary school districts
# NOTE(review): this file (and the secondary one below) is read from
# 'population_2020/' while the unified file is read from 'nhgis/' - confirm
# that both folder layouts are intended.
esd <- read.csv('nonproprietary_data/population_2020/nhgis0056_csv/nhgis0056_ds248_2020_sd_elm.csv')
esd <- esd %>%
  mutate(
    st = str_pad(STATEA, width = 2, pad = '0'),
    cnty = str_pad(COUNTYA, width = 3, pad = '0'),
    esd = str_pad(SDELMA, width = 5, pad = '0')
  )

# Almost all E, 29 F which are fictitious for statistical purposes
# ("Remainder of Arizona" etc.)
table(esd$FUNCSTAT, useNA = 'a')
# Keep only FUNCSTAT = E, active government with special functions
esd <- esd %>%
  filter(FUNCSTAT == 'E') %>%
  select(st, esd, esd_name = SDELM, esd_AREALAND = AREALAND, esd_pop = U7B001)

# Secondary school districts
ssd <- read.csv('nonproprietary_data/population_2020/nhgis0056_csv/nhgis0056_ds248_2020_sd_sec.csv')
ssd <- ssd %>%
  mutate(
    st = str_pad(STATEA, width = 2, pad = '0'),
    cnty = str_pad(COUNTYA, width = 3, pad = '0'),
    ssd = str_pad(SDSECA, width = 5, pad = '0')
  )

# About 1/4 are F in this case
table(ssd$FUNCSTAT, useNA = 'a')
# Lot of the F rows are parts of USDs. Maybe a case where there are some
# separate ESDs but the secondary schools go to the USD? Drop these
ssd %>% filter(FUNCSTAT == 'F')
# Keep only FUNCSTAT = E, active government with special functions
ssd <- ssd %>%
  filter(FUNCSTAT == 'E') %>%
  select(st, ssd, ssd_name = SDSEC, ssd_AREALAND = AREALAND, ssd_pop = U7B001)




### Part 2: Bring in blocks and assign to fiscal units ###

# Going to merge on the CSDs and the places
# Blocks in a Place assigned to that place (have checked and NY villages are Places while towns are CSDs - we want villages to take precedence)
# Blocks in a CSD assigned to that CSD
# Everything else assigned to the county


# Build zero-padded state, county, CSD, place, block and school district codes,
# then keep only the columns needed downstream
cblock <- dblock %>%
  mutate(
    st = str_pad(STATEA, width = 2, pad = '0'),
    cnty = str_pad(COUNTYA, width = 3, pad = '0'),
    csd = str_pad(COUSUBA, width = 5, pad = '0'),
    place = str_pad(PLACEA, width = 5, pad = '0'),
    fips = str_pad(GEOCODE, width = 15, pad = '0'),
    usd = str_pad(SDUNIA, width = 5, pad = '0'),
    esd = str_pad(SDELMA, width = 5, pad = '0'),
    ssd = str_pad(SDSECA, width = 5, pad = '0'),
    cnty_LSAD = '06', # All counties are 06
    pop = U7B001
  ) %>%
  select(
    GISJOIN, GEOID, fips,
    st, STATE,
    cnty, COUNTY, cnty_LSAD,
    csd, place,
    usd, esd, ssd,
    AREALAND, pop
  )

# Combined county + CSD identifier ("<cnty>_<csd>"), used where a block is
# assigned to its CSD
cblock <- mutate(cblock, cnty_csd = paste(cnty, csd, sep = '_'))

# The raw block table is no longer needed
rm(dblock)

## Def 1 - smallest fiscal unit ##

# Attach place attributes; blocks whose place is not in cplaces get NA in the
# place_* columns
cblock <- cblock %>% left_join(cplaces, by = c('st', 'place'))
head(cblock)
sum(!is.na(cblock$place_name)) #3801647 blocks inside a Place

# Attach CSD attributes (v1 list)
cblock <- cblock %>% left_join(ccsd, by = c('st', 'cnty', 'csd'))
head(cblock)
sum(!is.na(cblock$csd_name)) #1972780 blocks inside a CSD
sum(!is.na(cblock$csd_name) & is.na(cblock$place_name)) #1567273 blocks inside a CSD and not inside a place
sum(!is.na(cblock$csd_name) & !is.na(cblock$place_name)) #405507 blocks inside a CSD and inside a place

## Single identifier: the place where one exists, else the CSD, else the county.
## case_when() takes the first matching arm, so each later arm only applies to
## rows that the earlier arms did not claim.
cblock <- cblock %>%
  mutate(
    loc = case_when(
      !is.na(place_name) ~ place,
      !is.na(csd_name) ~ cnty_csd,
      TRUE ~ cnty
    ),
    loc_name = case_when(
      !is.na(place_name) ~ place_name,
      !is.na(csd_name) ~ csd_name,
      TRUE ~ COUNTY
    ),
    loc_lsad = case_when(
      !is.na(place_name) ~ place_LSAD,
      !is.na(csd_name) ~ csd_LSAD,
      TRUE ~ cnty_LSAD # This isn't technically correct - Louisiana should be 15, Alaska 04/05, others other ones
    ),
    loc_type = case_when(
      !is.na(place_name) ~ "Place",
      !is.na(csd_name) ~ "CSD",
      TRUE ~ "County"
    )
  )

## Definition 2 - fiscal units but not townships ##

# Attach the restricted CSD list (v2). Its columns clash with the v1 CSD
# columns already on cblock, so the new ones get the '_v2' suffix.
cblock <- cblock %>%
  left_join(bcsd, by = c('st', 'cnty', 'csd'), suffix = c('', '_v2'))
head(cblock)
sum(!is.na(cblock$csd_name_v2)) #Only 519131 blocks inside a CSD with this definition
sum(!is.na(cblock$csd_name_v2) & is.na(cblock$place_name)) #483673 blocks inside a CSD and not inside a place with this definition
sum(!is.na(cblock$csd_name_v2) & !is.na(cblock$place_name)) #35458 blocks inside a CSD and inside a place with this definition

# Same precedence as loc (place, then CSD, then county), but a CSD only counts
# when it is in the v2 list. case_when() takes the first matching arm.
cblock <- cblock %>%
  mutate(
    loc2 = case_when(
      !is.na(place_name) ~ place,
      !is.na(csd_name_v2) ~ cnty_csd,
      TRUE ~ cnty
    ),
    loc2_name = case_when(
      !is.na(place_name) ~ place_name,
      !is.na(csd_name_v2) ~ csd_name_v2,
      TRUE ~ COUNTY
    ),
    loc2_lsad = case_when(
      !is.na(place_name) ~ place_LSAD,
      # Uses the v1 LSAD column; bcsd is a subset of ccsd, so it carries the
      # same value for any CSD that is in the v2 list
      !is.na(csd_name_v2) ~ csd_LSAD,
      TRUE ~ cnty_LSAD # This isn't technically correct - Louisiana should be 15, Alaska 04/05, others other ones, but for our purposes better to keep as one
    ),
    loc2_type = case_when(
      !is.na(place_name) ~ "Place",
      !is.na(csd_name_v2) ~ "CSD",
      TRUE ~ "County"
    )
  )

## Definition 3 - school districts ##

# Attach unified school districts
cblock <- cblock %>% left_join(usd, by = c('st', 'usd'))
head(cblock)
sum(!is.na(cblock$usd_name)) #7681162 blocks inside a USD - about 94%

# Attach elementary school districts
cblock <- cblock %>% left_join(esd, by = c('st', 'esd'))
head(cblock)
sum(!is.na(cblock$esd_name)) #488273 blocks inside a ESD
sum(!is.na(cblock$esd_name) & is.na(cblock$usd_name)) #All of them are not in a USD

# Attach secondary school districts
cblock <- cblock %>% left_join(ssd, by = c('st', 'ssd'))
head(cblock)
sum(!is.na(cblock$ssd_name)) #420472 blocks inside a ssd
sum(!is.na(cblock$ssd_name) & is.na(cblock$usd_name) & is.na(cblock$esd_name)) #All but 120 of them are in ESDs

sum(!is.na(cblock$esd_name) & is.na(cblock$usd_name) & is.na(cblock$ssd_name)) #67921 ESD blocks are not in SSDs as well
sum(!is.na(cblock$esd_name) & is.na(cblock$usd_name) & !is.na(cblock$ssd_name)) #420352 ESD blocks are in SSDs as well

sum(is.na(cblock$esd_name) & is.na(cblock$usd_name) & is.na(cblock$ssd_name)) #5400 blocks not in a school district (probably the FUNCSTAT F ones)


# There are scattered blocks that aren't assigned to any school districts
test <- cblock %>% filter(is.na(esd_name), is.na(ssd_name), is.na(usd_name))
sum(test$pop) #82527
table(test$STATE) #Scattered across states, lots of AZ (which matches FUNCSTAT = F above)
table(test$loc2_type) # Mostly counties, some incorporated

# Assign each block one school district: unified first, then elementary, then
# secondary. Blocks with none of the three go into one residual category per
# state. case_when() takes the first matching arm.
cblock <- cblock %>%
  mutate(
    school = case_when(
      !is.na(usd_name) ~ usd,
      !is.na(esd_name) ~ esd,
      !is.na(ssd_name) ~ ssd,
      TRUE ~ st
    ),
    school_name = case_when(
      !is.na(usd_name) ~ usd_name,
      !is.na(esd_name) ~ esd_name,
      !is.na(ssd_name) ~ ssd_name,
      TRUE ~ paste(STATE, 'remainder')
    ),
    school_type = case_when(
      !is.na(usd_name) ~ 'USD',
      !is.na(esd_name) ~ 'ESD',
      !is.na(ssd_name) ~ 'SSD',
      TRUE ~ 'State residual'
    )
  )


# Sanity checks on population and area
# Block population summed within each assigned unit should equal the Place
# population reported by NHGIS
checkblock <- cblock %>%
  group_by(st, loc, loc_type) %>%
  mutate(
    pop_calc = sum(pop),
    acalc = sum(AREALAND)
  )
# Everywhere that's in a place has the same calculated population as Place
# population directly from NHGIS (rows with no place are skipped via is.na)
stopifnot(is.na(checkblock$place_pop) | checkblock$place_pop == checkblock$pop_calc)

# Check CSD: NHGIS CSD population minus the block-based sum for the assigned unit
checkblock$csdcalc_pop <- checkblock$csd_pop - checkblock$pop_calc

sum(checkblock$csdcalc_pop < 0, na.rm = TRUE) #75032 cases where CSD pop from NHGIS is smaller than that calculated from blocks
checkblock %>%
  filter(csdcalc_pop < 0) %>%
  head() %>%
  data.frame()
# Mostly in IL, IN, KS, OH, some others
checkblock %>%
  filter(csdcalc_pop < 0) %>%
  ungroup() %>%
  count(STATE)
# All instances where they are in places. These are places that cover more than
# one CSD or perhaps also more than one County
checkblock %>%
  filter(csdcalc_pop < 0) %>%
  ungroup() %>%
  count(loc_type)
# First one is Gowanda, NY, which is split across two counties
# https://en.wikipedia.org/wiki/Gowanda%2C_New_York
checkblock %>%
  filter(csdcalc_pop < 0, STATE == 'New York') %>%
  select(STATE, place_name, csd_name, pop, csd_pop, place_pop, COUNTY, pop_calc) %>%
  head()

# Drop the check data
rm(checkblock)


### Part 3: Make clean versions to export ###

# Add on different metro area definitions

## 2020 CBSAs
cbsa <- read.csv('nonproprietary_data/msa_crosswalks/list1_2020.csv')

# Clean up CBSA: zero-padded state/county/CBSA/CSA codes and a flag that is
# TRUE for micropolitan areas
cbsa <- cbsa %>%
  mutate(
    st = str_pad(FIPS.State.Code, width = 2, pad = '0'),
    cnty = str_pad(FIPS.County.Code, width = 3, pad = '0'),
    cbsa = str_pad(CBSA.Code, width = 5, pad = '0'),
    csa = str_pad(CSA.Code, width = 3, pad = '0'),
    micro = Metropolitan.Micropolitan.Statistical.Area == 'Micropolitan Statistical Area'
  ) %>%
  select(st, cnty, cbsa, csa, micro)

# Bring in 1990 CZs
czs <- read.csv('nonproprietary_data/msa_crosswalks/cty_cz_st_crosswalk.csv')
# The county code is the last three characters of the padded 5-character code
czs <- czs %>%
  mutate(
    cz = str_pad(cz, width = 5, pad = '0'),
    st = str_pad(state_fips, width = 2, pad = '0'),
    fips = str_pad(cty, width = 5, pad = '0'),
    cnty = str_sub(fips, start = 3, end = 5)
  ) %>%
  select(st, cnty, cz)


## Fix missing counties - cases where county name/fips in 2020 Census
## Differs from that in 1990 CZ data - taking info from https://www.census.gov/programs-surveys/geography/technical-documentation/county-changes.1990.html#list-tab-957819518
##
## czcounty is the 1990-era county code used to look up the commuting zone.
## Counties whose code did not change keep their 2020 code.

cblock <- mutate(cblock, czcounty = case_when(
  st == '08' & cnty == '014' ~ '013', # Broomfield county from Boulder County
  st == '12' & cnty == '086' ~ '025', # Miami Dade county was just Dade county
  st == '46' & cnty == '102' ~ '113', # Oglala Lakota County from Shannon County
  st == '15' & cnty == '005' ~ '009', # Treat Kalawao county as part of Maui county since not in CZ crosswalk

  # Lots of Alaska changes
  st == '02' & cnty == '063' ~ '261',
  st == '02' & cnty == '066' ~ '261',
  st == '02' & cnty == '068' ~ '290',
  st == '02' & cnty == '105' ~ '231',
  # 158 is Kusilvak Census Area, which is the renamed and recoded Wade Hampton
  # Census Area (270). It was previously mapped to 261 (Valdez-Cordova), which
  # is a different area and put these blocks in the wrong CZ.
  # NOTE(review): confirm 02270 is present in the CZ crosswalk; the
  # stopifnot() on cz after the merge will fail loudly if it is not.
  st == '02' & cnty == '158' ~ '270',
  st == '02' & cnty == '195' ~ '280',
  st == '02' & cnty == '198' ~ '201',
  st == '02' & cnty == '230' ~ '231',
  st == '02' & cnty == '275' ~ '280',
  st == '02' & cnty == '282' ~ '231',

  # Everything else: county code unchanged. TRUE rather than T, because T is an
  # ordinary variable that can be reassigned.
  TRUE ~ cnty
))


# Attach CBSA info by 2020 county, then the commuting zone by the 1990-era
# county code (czcounty on the block side, cnty on the crosswalk side)
cblock <- cblock %>%
  left_join(cbsa, by = c('st', 'cnty')) %>%
  left_join(czs, by = c('st', 'czcounty' = 'cnty'))

# Check merge
sum(is.na(cblock$cbsa)) #1585337, but that includes places that aren't assigned to CBSAs
sum(is.na(cblock$cz)) #41987 - But these are all in PR
# Every block outside Puerto Rico (st 72) must have a commuting zone
stopifnot(cblock$st == '72' | !is.na(cblock$cz))

test2 <- cblock %>% filter(is.na(cz))
test2 %>% group_by(STATE) %>% count() # All PR


# Flag for metro areas (vs micro): TRUE only when the block's county is in a
# CBSA and that CBSA is not micropolitan. Blocks with no CBSA get FALSE.
cblock <- cblock %>%
  mutate(metro = !is.na(cbsa) & !micro)



## Export block level crosswalk with clean FU identifiers, no Puerto Rico
exp_block <- cblock %>%
  filter(st != '72') %>%
  select(
    # Block and state / county identifiers
    GISJOIN, GEOID, fips, st, STATE, cnty, cnty_name = COUNTY,
    # Place and CSD attributes (v1 CSD list)
    place, place_name, place_LSAD, cnty_csd, csd_name, csd_LSAD,
    # Definition 1
    loc, loc_name, loc_type, loc_lsad,
    # Definition 2
    loc2, loc2_name, loc2_type, loc2_lsad,
    # Definition 3
    school, school_name, school_type,
    # Block population and land area
    pop, AREALAND,
    # Metro area definitions
    cz, cbsa, csa, micro, metro
  )
# The full working table is no longer needed
rm(cblock)

save(exp_block, file = 'intermediate_data/fiscal_units/fu_block_crosswalk.Rdata')


# Which blocks are assigned to a Place but also sit in a CSD that is a
# functional government? (Townships, LSAD 44/45, are excluded, except LSAD 44
# in NJ.)
# CT, MI, NY, VT only, almost all NY
exp_block %>%
  filter(
    loc_type == 'Place',
    !is.na(csd_LSAD),
    !(csd_LSAD %in% c(44, 45)) | (csd_LSAD == 44 & st == '34')
  ) %>%
  count(STATE, st)

# Michigan - some villages are inside charter townships
exp_block %>%
  filter(loc_type == 'Place', !is.na(csd_LSAD), !(csd_LSAD %in% c(44, 45)), st == '26')

# Vermont - both towns and villages show up, some incorporated villages are inside towns
exp_block %>%
  filter(loc_type == 'Place', !is.na(csd_LSAD), !(csd_LSAD %in% c(44, 45)), st == '50')

# Connecticut has boroughs that are separate from the towns they are within
exp_block %>%
  filter(loc_type == 'Place', !is.na(csd_LSAD), !(csd_LSAD %in% c(44, 45)), st == '09')

# Most are NY, where villages can be within towns
exp_block %>%
  filter(loc_type == 'Place', !is.na(csd_LSAD), !(csd_LSAD %in% c(44, 45)), st == '36')
