Water quality perception vs. reality in California

In this notebook, we pull together the data to make the infographics in our final DP3 project, “Who Trusts the Tap?” located here: https://jenny7hi.github.io/desinst215/. If you have any questions about this code please contact Sabine Loos at sloos@stanford.edu.

# load libraries
suppressMessages(library(rgdal))
suppressMessages(library(rgeos))
suppressMessages(library(ggplot2))
suppressMessages(library(plotly))
suppressMessages(library(dplyr))
suppressMessages(library(ggrepel))
suppressMessages(library(extrafont))

# load fonts
suppressMessages(font_import(pattern = "OpenSans-Regular", prompt = F))
suppressMessages(loadfonts(device = "win"))
# load functions
source("functions/sp2gg.R") #read in function to convert to ggplot polygon
source("functions/PlottingFunctions.R") #themes for plots

First we have to read in the data. This includes:

Counties of California shapefile and their demographics, downloaded from www.simplyanalytics.com
The water quality perception of each county in 2015, made from our perception model (see code at bottom)
Water bottle sales per county for 2015, from Nielsen reportings
Violations per county, from XXX

CA_demog <- readOGR("data/raw_data/Counties_CA_Income_water")
CA_perception <- readRDS("data/output_data/CA_county_perception_052419.rds")
wb_dat <- read.table(file = 'data/raw_data/movement_2015/merged_movement_2015.tsv', sep = '\t', header = TRUE)
viol_dat <- read.csv(file = "data/raw_data/Violations_County.csv", header = T)

Now we have to rename columns and merge all of these together. First we merge the county demographics and county perception into our working dataframe: CA_counties

# view names of county demography data and rename
names(CA_demog) <- c("FIP", "County", "p_house2010", "p_over100k", "n_house", "p_poverty", "p_watertot", "p_wateravg", "n_HDD_year")
# make all FIP id's 4 numbers (without leading 0)
CA_demog@data$FIP <-as.factor(substr(as.character(CA_demog@data$FIP), 2,5))
CA_perception$FIP <-as.factor(substr(as.character(CA_perception$FIP), 2,5))
# merge demography data with perception data to make working county dataframe
CA_counties <- merge(CA_demog, CA_perception, by = "FIP")
# add violation data
viol_dat$FIPS <- factor(viol_dat$FIPS)
CA_counties <- merge(CA_counties, viol_dat, by.x = "FIP", by.y = "FIPS")

The water bottle sales are each individual sale for 2015. We have to merge them per county and calculate the total number of sales, total volume, sales per capita, and volume per capita before merging with the county shapefile

# group the water bottle data by counties, then calculate total volume and total sales
wb_dat_grouped <- wb_dat %>%
  group_by(fips) %>%
  summarise(total_sales_2015 = sum(total_sales, na.rm = T),
            total_vol_2015 = sum(total_vol, na.rm = T))
# merge with county shapefile
CA_counties <- merge(CA_counties, wb_dat_grouped, by.x = "FIP", by.y = "fips", all.x = T)
# calculate the sales per capita and volume per capita by dividing by population
CA_counties$sales_cap <- CA_counties$total_sales_2015/CA_counties$POPULATION_2017
CA_counties$vol_cap <- CA_counties$total_vol_2015/CA_counties$POPULATION_2017
head(CA_counties@data)

Mapping perception vs. reality

Before we map the data, we have to turn the sp dataframe (the shapefile) into a regular dataframe that ggplot can use to make polygons

CA_counties_gg <- sp2gg(CA_counties)

Our first map is of water quality perception. Here we plot the percentage of the population who think the water is safe

ggplot(CA_counties_gg, aes(long, lat, group=group)) + 
  geom_polygon(aes(fill = p_safe*100),colour = "transparent") + 
  scale_fill_distiller("Percent of population \nwho think water is safe",palette = "Reds")+
  coord_equal() + plotThemeMap(base_size = 16) + theme(legend.position = c(0.8,0.8),legend.text = element_text(family="Open Sans"),
                                                       legend.title = element_text(family="Open Sans"))

Now, we want to compare the perception to reality. So we plot the percentage of the population who is affected by at least 1 violation

ggplot(CA_counties_gg, aes(long, lat, group=group)) + 
  geom_polygon(aes(fill = as.numeric(p_popaffected_1violation)),colour = "transparent") + 
  scale_fill_distiller("Percent of population \naffected by at least one \nviolation", palette = "Reds", direction = -1, trans = "reverse")+
  coord_equal() + plotThemeMap(base_size = 16) + theme(legend.position = c(0.8,0.8),legend.text = element_text(family="Open Sans"),
                                                       legend.title = element_text(family="Open Sans"))

Clearly, perception and reality do not necessarily align. Let’s dig into that further by looking at three counties:

ind_4counties <- which(CA_counties@data$FIP %in% c("6077","6005", "6085", "6039"))
CA_counties@data[ind_4counties,c("FIP","County.x","p_safe","p_popaffected_1violation")]

Mapping safety perception for San Joaquin

ind_4counties <- which(CA_counties_gg$FIP %in% c("6077"))
ggplot() + 
  geom_polygon(data = CA_counties_gg[-ind_4counties,], aes(long, lat, group=group,fill = p_safe*100),alpha = 0.25,colour = "transparent") + 
  geom_polygon(data = CA_counties_gg[ind_4counties,],aes(long, lat, group=group,
                                                         fill = p_safe*100),colour = "white", size = 1) + 
  scale_fill_distiller("% of population who\nthinks water is safe",palette = "Reds")+
  coord_equal() + plotThemeMap(base_size = 16) + theme(legend.position = "none")

Mapping violations for san joaquin

ggplot() + 
  geom_polygon(data = CA_counties_gg[-ind_4counties,], aes(long, lat, group=group,fill = as.numeric(p_popaffected_1violation)),alpha = 0.25,colour = "transparent") + 
  geom_polygon(data = CA_counties_gg[ind_4counties,],aes(long, lat, group=group,
                                                         fill = as.numeric(p_popaffected_1violation)),colour = "white", size = 1) + 
  scale_fill_distiller("% of population who\nthinks water is safe",palette = "Reds", direction = 1)+
  coord_equal() + plotThemeMap(base_size = 16) + theme(legend.position = "none")

Mapping safety for other 3 counties

ind_4counties <- which(CA_counties_gg$FIP %in% c("6005", "6085", "6039"))
ggplot() + 
  geom_polygon(data = CA_counties_gg[-ind_4counties,], aes(long, lat, group=group,fill = p_safe*100),alpha = 0.25,colour = "transparent") + 
  geom_polygon(data = CA_counties_gg[ind_4counties,],aes(long, lat, group=group,
                                                         fill = p_safe*100),colour = "white", size = 1) + 
  scale_fill_distiller("% of population who\nthinks water is safe",palette = "Reds")+
  coord_equal() + plotThemeMap(base_size = 16) + theme(legend.position = "none")

Mapping violations for other three counties

ggplot() + 
  geom_polygon(data = CA_counties_gg[-ind_4counties,], aes(long, lat, group=group,fill = as.numeric(p_popaffected_1violation)),alpha = 0.25,colour = "transparent") + 
  geom_polygon(data = CA_counties_gg[ind_4counties,],aes(long, lat, group=group,
                                                         fill = as.numeric(p_popaffected_1violation)),colour = "white", size = 1) + 
  scale_fill_distiller("% of population who\nthinks water is safe",palette = "Reds", direction = 1)+
  coord_equal() + plotThemeMap(base_size = 16) + theme(legend.position = "none")

Understanding how perception affects consumption

Now we want to see if perception affects our actions. So we compare perception to spending on water bottles (we’d expect people who think the tap water to be unsafe to buy more water bottles)

Let’s take a look at the relationship between the water bottle spending and people’s perception

ind <- which.max(CA_counties@data$vol_cap)
ggplot(CA_counties@data[-ind,], aes(y = vol_cap,x = p_safe*100))+
  geom_point(na.rm=TRUE)+
  stat_smooth(method = "lm", alpha = 1, se = F, col = "turquoise4", na.rm=TRUE)+
  labs(y = "Volume of water bottles sold per person, 2015 (oz)",
       x = "Percent of population who think water is safe")+
  scale_color_manual(values = c("tomato3","black"))+
  plotThemeDesign()+theme(legend.position='none', axis.title = element_text(family="Open Sans"), axis.text = element_text(family="Open Sans"))

If we pull out our highighted counties…

ind_4counties <- which(CA_counties@data$FIP %in% c("6077", "6005", "6085", "6039"))
ind <- which.max(CA_counties@data$vol_cap)
CA_counties@data$county_simple <- stringr::str_split(as.character(CA_counties@data$County.x), " County", simplify = T)[,1]
CA_counties@data$highlight = "regular"; CA_counties@data$highlight[ind_4counties] = "highlight"
ggplot(CA_counties@data[-ind,], aes(y = vol_cap,x = p_safe*100))+
  geom_point(na.rm=TRUE, aes(color = highlight))+
  geom_text_repel(data = CA_counties@data[ind_4counties,],aes(label = county_simple),nudge_x = 0.2,nudge_y = 25,hjust=0,
                  direction = "y",family="Open Sans")+
  stat_smooth(method = "lm", alpha = 1, se = F, col = "turquoise4", na.rm=TRUE)+
  labs(
       y = "Volume of water bottles sold per capita, 2015 (oz)",
       x = "Percent of population who think water is safe")+
  scale_color_manual(values = c("tomato3","black"))+
  plotThemeDesign()+theme(legend.position='none', axis.title = element_text(family="Open Sans"), 
                          axis.text = element_text(family="Open Sans"))

How to track perception?

Okay, so now that we know that this is a marginal correlation between perception and consumption, how can cities track perception better (Without having to survey people all the time)

Here we explore google searches as a proxy for perception. Google search data is organized by metro. So we read in the search popularity data by metro area. Here we aggregate the searches for the following terms:

water quality
water purification
tap water
safe water
drinking water
water filter

search_dat <- read.csv(file = "data/raw_data/search_data.csv")
head(search_dat)

Now we compare perception vs. searches. There’s a trend! Cities may be able to track perception based on the google searches on these terms:

ggplot(search_dat, aes(x = average,y = p_safe*100))+
  geom_point()+
  labs(
       x = "Google search popularity",
       y = "Percent of population who think water is safe")+
  plotThemeDesign()+theme(legend.position='none', axis.title = element_text(family="Open Sans"), 
                          axis.text = element_text(family="Open Sans"))

Code for calculating perception

Based on data from the 2015 American Housing Survey (AHS), researchers at Amherst College and UCLA identified sociodemographic factors that influence individuals’ perception of tap water (Javidi, 2018). The Census Bureau releases individual responses to the questionnaire, which contains multiple demographic factors and a yes or no question on whether a respondent thinks their tap water is safe. If we knew the counties of each AHS respondent, we would be able to map these responses directly, however, the government does not release any personally identifiable data with these responses to protect individual’s privacy.

Instead, we developed our own model using the AHS data to estimate water quality perception for each county in California. In this model, we essentially build “fake counties” by sampling responses in the survey. We sample the AHS survey until the fake county matches the county’s true demographics on a set of sociodemographic characteristics that were shown in the report to influence an individual’s perception of water safety—things like ethnicity, homeownership, or education. Therefore, water quality perception in this map is based on these demographics.

This method is extremely similar to Monte Carlo simulation, since we are attempting to rebuild a moment of a distribution of county demographics through random sampling. Monte Carlo simulation can be flawed, because it is slow to converge and the final distribution may not match the true distribution (even if the moments do) (link for more information). This model provides a sufficient first pass at California’s perception of water quality (with some uncertainty), but could be improved in the future to include the uncertainty or reach convergence at a quicker rate. In the future, one could also explore using support vector machines to classify water quality perception at the county level.

print("test perception")
library(dplyr)
library(future)
library(doFuture)
setwd("/home/users/sloos/DP3-analysis")
# perception model for batch processing
CA_county_dat <- readRDS("data/output_data/CA_county_demog.rds")
AHS_perc_dat <- readRDS("data/output_data/AHS_perception_dat.rds")
# function to calculate the percentage of each demographic given an array
perc_calc <- function(array_samp){
  p_white <- (array_samp %>% filter(HHRACE ==1) %>% filter(HHSPAN == 2) %>% summarise(p = n()))[1,1]/nrow(array_samp)
  p_hisp <- (array_samp %>% filter(HHRACE ==1) %>% filter(HHSPAN == 1) %>% summarise(p = n()))[1,1]/nrow(array_samp)
  p_black <- (array_samp %>% filter(HHRACE ==2) %>% summarise(p = n()))[1,1]/nrow(array_samp)
  p_other <- (array_samp %>% filter(HHRACE >= 3) %>% summarise(p = n()))[1,1]/nrow(array_samp)
  p_female <- (array_samp %>% filter(HHSEX ==2) %>% summarise(p = n()))[1,1]/nrow(array_samp)
  p_citizen <- (array_samp %>% filter(HHCITSHP !=5) %>% summarise(p = n()))[1,1]/nrow(array_samp)
  p_hsgrad <- (array_samp %>% filter(HHGRAD %in% seq(39,47)) %>% summarise(p = n()))[1,1]/nrow(array_samp)
  p_homeowner <- (array_samp %>% filter(TENURE ==1) %>% summarise(p = n()))[1,1]/nrow(array_samp)
  # p_mobile <- (array_samp %>% filter(BLD ==1) %>% summarise(p = n()))[1,1]/nrow(array_samp)
  # p_pubhs <- (array_samp %>% filter(HUDSUB == 1) %>% summarise(p = n()))[1,1]/nrow(array_samp)
  dat <- as.data.frame(cbind(p_white, p_hisp, p_black, p_other, p_female, p_citizen, p_hsgrad, p_homeowner))
  dat = dat*100
  return(dat)
}

registerDoFuture()
cl<- parallel::makeCluster(as.numeric(Sys.getenv("SLURM_NTASKS")))
future::plan(tweak(cluster, workers = cl))

system.time(
  p_safe <- foreach(i=1:nrow(CA_county_dat), .combine="rbind") %dopar% {
    real_county <- CA_county_dat[i,-c(1,10,11)]
    # build initial fake county (using 10 rows of the AHS survey)
    ind <- sample(nrow(AHS_perc_dat), 10, replace = F)
    d1 =1000
    j=1
    while (d1 > 30 && j < 5000) { # 5% error for each var
      fake_county1 <- perc_calc(AHS_perc_dat[ind,])
      # calculate distance between vectors
      d1 = dist(rbind(real_county, fake_county1), method = "euclidean")[1]
      # randomly add another row to vec 2
      ind2 <- sample(nrow(AHS_perc_dat),1)
      fake_county2 = perc_calc(AHS_perc_dat[append(ind,ind2),])
      d2 = dist(rbind(real_county, fake_county2), method = "euclidean")[1]
      if(d2 <d1){
        fake_county1 = fake_county2
        ind = append(ind,ind2)
      }
      j=j+1
    }
    # calculate the percentage who think water is safe
    p_safe = length(which(AHS_perc_dat[ind,]$WATSAFE ==1))/length(ind)
    d1_vec = d1
    dat = cbind(p_safe, d1_vec)
  }
)

p_safe <- data.frame(p_safe, row.names = row.names(CA_county_dat))
p_safe <- cbind(CA_county_dat, p_safe)
# save
date <- format(Sys.Date(), "%m%d%y")
saveRDS(p_safe, paste0("data/output_data/CA_county_perception_",date,".rds"))

References