natlownes · March 30, 2023 17:54
diff --git a/1973-crime-and-injustice-dataset.R b/1973-crime-and-injustice-dataset.R
 library(arrow)
 library(daoviz)
 library(dplyr)
 library(ggplot2)
 library(gt)
 library(haven)
 library(lubridate)
 library(purrr)
 library(snakecase)
 library(tidyverse)
 library(zeallot)

 ################################################################################
 # notes on the data from the Inquirer stories
 ################################################################################
 #
 #
 # from "Public Officials Keep Alive the Myth of 'Equal Justice'", Sunday Feb. 18,
 # 1973:
 #
 #   The investigation and computer study centered on the cases of 1034 persons
 #   who were indicted during 1971 for at least one of the four major crimes of
 #   violence -- murder, rape, aggravated robbery and aggravated assault and
 #   battery
 #
 #   Using information culled from more than 10,000 court documents and 20,000
 #   pages of transcripts of court proceedings, Inquirer reporters traced from
 #   beginning to end -- from commission of the crime to trial and sentencing --
 #   the cases of the 1,034 defendants
 #
 #
 # from "Court Story Built on 75,000 Facts", Sunday Feb. 18, 1973:
 # 
 #   The usual way that prosecutors and the courts maintain statistics --
 #   computing convictions and acquittals -- is by the individual defendant, but
 #   the Inquirer survey measured the outcome of cases from the viewpoint of the
 #   victim.
 #   
 #   Although there were 1,034 defendants in the study, the number of cases in
 #   the computer analysis totaled 1,374. The bulk of the difference represents
 #   those persons who were indicted for two or more separate, major crimes
 #   against different victims.
 #
 #   The cases of the 1,034 defendants represent a 39 percent sampling of all
 #   persons indicted in 1971 for one of the four violent crimes.


 # Locate the SPSS portable file for the 1973 Inquirer series and read it.
 inquirer_data_path <- daocore::get_data_path(
   "1973_inquirer_series/knight_s609_spss.por"
 )

 inky_df_src <- haven::read_spss(
   file = inquirer_data_path,
   user_na = FALSE,
   .name_repair = "universal"
 )

 # column identifiers exactly as they appear in the source file
 column_names <- names(inky_df_src)
 # var labels: the descriptive name attached to each column
 var_labels <- labelled::var_label(inky_df_src)
 # val labels: per-column code -> text mappings, like an enum datatype in a db
 val_labels <- labelled::val_labels(inky_df_src)

 # Some fields are missing val labels but spell them out inside the variable
 # label instead, e.g.:
 #   "SEX OF VICTIM: 1-MALE,2-FEMALE"
 #   "RACE:1-WHITE,2-BLACK,3-OTHER"
 # The pattern below detects those labels: some text, one or more ':' or '?',
 # optional whitespace, then a literal "1".
 might_be_enum_field <- ".+(\\:|\\?)+(\\s)*1"

 # Parse inline value labels out of a variable label.
 #
 # For columns that have no val labels, the variable label may still spell the
 # enum out after a ':' or '?' (the format matched by `might_be_enum_field`),
 # e.g. "SEX OF VICTIM: 1-MALE,2-FEMALE" -> c(MALE = 1L, FEMALE = 2L).
 #
 # s: a single variable label string.
 # Returns a named integer vector (label text -> code), or NULL when `s` has no
 # ':' / '?' separator or nothing after it parses as "<integer>-<label>".
 parse_val_labels_from_field_name <- function(s) {
   # everything after the first run of ':' / '?' is the enum-ish text; treating
   # a run such as "?:" as ONE separator keeps that text from coming back empty
   sep <- regexpr("[:?]+", s)
   if (is.na(sep) || sep < 0) {
     return(NULL)
   }
   enum_text <- substring(s, sep + attr(sep, "match.length"))
   items <- trimws(strsplit(enum_text, ",", fixed = TRUE)[[1]])

   parsed <- NULL
   for (item in items) {
     # split on the FIRST hyphen only, so a label that itself contains hyphens
     # (e.g. "1-YES-NO TIME") is kept whole; items without an integer code are
     # skipped instead of producing NA entries
     pieces <- regmatches(
       item,
       regexec("^\\s*(\\d+)\\s*-\\s*(.+)$", item)
     )[[1]]
     if (length(pieces) != 3L) {
       next
     }
     parsed[pieces[3]] <- as.integer(pieces[2])
   }
   parsed
 }

 # Build a date from separate year/month/day values. The dataset stores years
 # as two digits, so 1900 is added to get the full year.
 inky_dataset_date_format <- function(y, m, d) {
   full_year <- y + 1900
   lubridate::make_date(year = full_year, month = m, day = d)
 }

 # Expand a two digit year into a four digit year. Years greater than `cutoff`
 # are placed in the 1800s, all others in the 1900s (e.g. with the default
 # cutoff of 70, a year born of 85 becomes 1885 and 45 becomes 1945).
 inky_dataset_year <- function(y, cutoff = 70) {
   is_prior_century <- y > cutoff
   dplyr::if_else(
     condition = is_prior_century,
     true = y + 1800,
     false = y + 1900
   )
 }

 # Augment the val labels that came with the dataset. A column keeps its own
 # val labels when it has them; otherwise, if its variable label looks like an
 # inline enum (see `might_be_enum_field`; defendant race is an example in the
 # dataset), the val labels are parsed out of that variable label. Columns
 # matching neither case get NULL.
 labels <- lapply(column_names, function(col) {
   col_key <- as.character(col)
   existing <- purrr::pluck(val_labels, col_key)
   if (!is.null(existing)) {
     return(existing)
   }
   description <- purrr::pluck(var_labels, col_key, .default = col)
   if (!grepl(might_be_enum_field, description)) {
     return(NULL)
   }
   parse_val_labels_from_field_name(description)
 })

 # key the list by column name (`V${integer}`) and write it back to the data
 names(labels) <- column_names
 labelled::val_labels(inky_df_src) <- labels

 # Some variable labels reference other columns using the "V${integer}" style;
 # this function replaces each such reference with the referenced column's
 # readable label.
 # example: "DISPOSITION V94, V104, INTENT TO MURDER"
 # A referenced label may contain references of its own, so it is dereferenced
 # recursively before being substituted.
 #
 # col_name: a single variable label string.
 # labels:   named list of variable labels keyed by column id; defaults to the
 #           file-level `var_labels`.
 # Returns `col_name` with every reference expanded; a reference that has no
 # label expands to the empty string.
 dereference_column_name <- function(col_name, labels = var_labels) {
   refs <- unique(unlist(regmatches(col_name, gregexpr("V\\d+", col_name))))
   if (length(refs) == 0) {
     return(col_name)
   }
   for (ref in refs) {
     label <- if (ref %in% names(labels)) labels[[ref]] else NULL
     if (is.null(label)) {
       label <- ""
     }
     expanded <- dereference_column_name(label, labels)
     # the lookahead stops a short reference such as "V9" from also rewriting
     # the front of a longer one such as "V94"
     col_name <- gsub(
       paste0(ref, "(?![0-9])"),
       expanded,
       col_name,
       perl = TRUE
     )
   }
   # all the referenced cols are disposition columns, remove only instances of
   # that word that have whitespace before it -- these occur within the
   # concat'd referenced column names
   gsub(" DISPOSITION", "", col_name)
 }


 # Sentence-length codes mapped to a (lower, upper) pair of durations, listed
 # in code order. The codebook pdf says that in addition to values 0 thru 9 as
 # defined here there's also a life sentence coded "++", but it doesn't seem
 # like that exists in the dataset.
 sentence_enum_to_day_range <- list(
   list(ddays(0), ddays(0)),                     # sentence suspended
   list(ddays(0), ddays(0)),                     # fines and costs
   list(ddays(1), dmonths(3)),
   list(dmonths(4), dmonths(6)),
   list(dmonths(7), dmonths(12)),
   list(dmonths(13), dyears(2)),
   list(dyears(2) + ddays(1), dyears(5)),
   list(dyears(5) + ddays(1), dyears(10)),
   list(dyears(10) + ddays(1), dyears(20)),
   list(dyears(20) + ddays(1), dyears(Inf)),
   list(dyears(Inf), dyears(Inf))                # life
 )

 # flatten each side of the ranges into its own vector, one entry per code
 sentence_min_bounds <- unlist(
   lapply(sentence_enum_to_day_range, function(bounds) bounds[1])
 )
 sentence_max_bounds <- unlist(
   lapply(sentence_enum_to_day_range, function(bounds) bounds[2])
 )

 # Readable column names: the snake-cased, dereferenced variable label of each
 # column, keyed by the original column name.
 formatted_col_names <- lapply(column_names, function(col) {
   label <- purrr::pluck(var_labels, as.character(col), .default = col)
   readable <- dereference_column_name(label)
   # unlabeled columns (which we'll ignore) come back as '' -- keep their
   # original name
   if (readable == "") col else snakecase::to_snake_case(readable)
 })

 names(formatted_col_names) <- column_names

 # Map original column names to their entries in `formatted_col_names`.
 snake_case_columns <- function(v) {
   lookup <- function(s) {
     purrr::pluck(formatted_col_names, s)
   }
   purrr::map_chr(v, lookup)
 }

 # Expand abbreviations that appear in column names. Currently only "aggr_a_b"
 # (-> "aggr_assault_and_battery") is handled.
 standard_column_names <- function(v) {
   gsub(
     pattern = "aggr_a_b",
     replacement = "aggr_assault_and_battery",
     x = v
   )
 }

 # Apply all column renamings in order: snake-cased variable labels, then the
 # manual `renamings`, then abbreviation expansion.
 inky_rename_columns <- function(df) {
   snake_cased <- dplyr::rename_with(df, snake_case_columns)
   manually_renamed <- dplyr::rename(snake_cased, all_of(renamings))
   dplyr::rename_with(manually_renamed, standard_column_names)
 }

 # Consolidate the date fields that exist as three columns
 # (`year_of_<event>`, `month_of_<event>`, `day_of_<event>`) into a single
 # `<event>_date` column per event, then drop the three source columns.
 datepart_fields_mutations <- function(df) {
   event_names <- c("arrest", "prelim_hearing", "indictment", "arraignment",
                    "trial", "sentence")
   datepart_names <- c("year_of", "month_of", "day_of")

   # source column names for one event, in year/month/day order
   source_columns <- function(event) {
     paste0(datepart_names, "_", event)
   }

   # one quoted `inky_dataset_date_format(year, month, day)` call per event;
   # each argument takes the first element of its source column
   date_exprs <- lapply(event_names, function(event) {
     parts <- lapply(source_columns(event), function(column) {
       quo(purrr::pluck(!!sym(column), 1))
     })
     quo(inky_dataset_date_format(!!!parts))
   })
   # new columns we're creating
   names(date_exprs) <- paste0(event_names, "_date")

   drop_columns <- unlist(lapply(event_names, source_columns))

   with_dates <- dplyr::mutate(df, !!!date_exprs)
   dplyr::select(with_dates, -all_of(drop_columns))
 }

 # Manual column renamings, written as new_name = old_name. The old names are
 # snake-cased variable labels that still carry their inline enum text.
 renamings <- c(
   race_defendant        = "race_1_white_2_black_3_other",
   prior_arrests         = "prior_arrests_1_yes_2_no",
   transcript_available  = "transcript_1_yes_2_no_3_copied",
   race_judge            = "race_of_judge_1_black_2_white",
   jury_trial            = "jury_trial_1_yes_2_no",
   had_transcript        = "was_there_a_transcript_1_yes_2_no",
   party_of_judge        = "party_of_judge_1_dem_2_rep",
   was_judge_a_former_da = "was_judge_a_former_da_1_yes_2_no",
   race_of_victim        = "race_of_victim_1_white_2_black_3_other",
   data_compiled_by      = "data_compiled_by_1_barlett_2_steele",
   sex_of_victim         = "sex_of_victim_1_male_2_female",
   sex_of_defendant      = "sex_of_defendant_1_male_2_female"
 )

 # Main cleaned table: readable column names, factor values, a four digit
 # year_born and one date column per event.
 inky_df <- inky_df_src |>
   inky_rename_columns() |>
   # turns all the spss integers into their factor values
   haven::as_factor() |>
   dplyr::mutate(year_born = inky_dataset_year(year_born)) |>
   dplyr::rowwise() |>
   datepart_fields_mutations() |>
   # incident date doesn't fit the naming pattern handled by
   # datepart_fields_mutations(), so build that one manually
   dplyr::mutate(
     incident_date = inky_dataset_date_format(
       year_of_incident_last_2_digits[1],
       month_of_incident_1_thru_12[1],
       day_of_incident_1_thru_31[1]
     )
   ) |>
   dplyr::select(-all_of(c(
     # condensed into incident_date above
     "year_of_incident_last_2_digits",
     "month_of_incident_1_thru_12",
     "day_of_incident_1_thru_31",
     # unnamed and null
     "V67",
     "V138"
   )))

 # group the per-charge columns by the prefix of their name
 col_names <- names(inky_df)
 disposition_cols <- grep("^disposition", col_names, value = TRUE)
 probation_cols <- grep("^probation", col_names, value = TRUE)
 min_sentence_cols <- grep("^minimum_sentence", col_names, value = TRUE)
 max_sentence_cols <- grep("^maximum_sentence", col_names, value = TRUE)
 multi_sentence_cols <- grep("^multiple_sentence", col_names, value = TRUE)

 sentencing_columns <- c(
   min_sentence_cols,
   max_sentence_cols,
   multi_sentence_cols
 )

 # For debugging: returns a function that, given one row's values for
 # `columns`, collects the names of the columns that have a value, sorts them,
 # strips `pattern` from each and joins them with '||' into one text field.
 summary_column <- function(pattern, columns) {
   function(v) {
     present <- columns[!is.na(v)]
     stripped <- gsub(pattern, "", stringr::str_sort(present))
     paste0(stripped, collapse = "||")
   }
 }

 # names of the debugging summary columns
 summary_columns <- c(
   "charge_dispositions_summary",
   "charge_probations_summary",
   "charge_minimum_sentences_summary",
   "charge_maximum_sentences_summary",
   "charge_multi_sentences_summary"
 )

 # For debugging: put the names of the charges that have a value into one
 # summary column per column group, so it's easier to digest what data is and
 # is not present for each row before those columns are lengthened.
 # NOTE(review): inky_df is left rowwise after this step, as before; consider
 # dplyr::ungroup() once nothing downstream depends on row-wise evaluation.
 inky_df <- inky_df |>
   dplyr::rowwise() |>
   dplyr::mutate(
     charge_dispositions_summary =
       summary_column("^disposition_", disposition_cols)(
         c_across(all_of(disposition_cols))
       ),
     charge_probations_summary =
       summary_column("^probation_", probation_cols)(
         c_across(all_of(probation_cols))
       ),
     charge_minimum_sentences_summary =
       summary_column("^minimum_sentence_", min_sentence_cols)(
         c_across(all_of(min_sentence_cols))
       ),
     charge_maximum_sentences_summary =
       summary_column("^maximum_sentence_", max_sentence_cols)(
         c_across(all_of(max_sentence_cols))
       ),
     # summarize the multiple_sentence_* columns here; this previously read
     # the maximum_sentence_* columns, so the prefix was never stripped and the
     # wrong columns were reported
     charge_multi_sentences_summary =
       summary_column("^multiple_sentence_", multi_sentence_cols)(
         c_across(all_of(multi_sentence_cols))
       )
   )
  

 # create a dispositions table with three columns: unique_case_id, charge,
 # disposition_outcome
 # example row:
 # unique_case_id | charge             | disposition_outcome
 # ---------------|--------------------|--------------------
 # 7466801        | murder             | FND GLTY-LSSR OFFNS
 dispositions <- inky_df[c("unique_case_id", disposition_cols)] |>
   tidyr::pivot_longer(
     cols = all_of(disposition_cols),
     names_to = "charge",
     values_to = "disposition_outcome",
     # strip the column prefix so only the charge name is left
     names_transform = list(
       charge = function(v) gsub("disposition_", "", v)
     )
   ) |>
   # lengthening creates a row for every charge column; keep only real values
   dplyr::filter(!is.na(disposition_outcome)) |>
   dplyr::distinct()
  
 # create a probations table with three columns: unique_case_id, charge,
 # probation
 # example table:
 # unique_case_id | charge             | probation
 # ---------------|--------------------|--------------------
 # 7466801        | murder             | YES-NO TIME
 # 7466802        | murder             | YES-WITH TIME
 # 
 # YES-NO TIME: yes, with no time in jail
 # YES-WITH TIME: yes, after time served
 # a YES-WITH TIME value can indicate that jail time was sentenced for another
 # charge in the same case. you can see these when joining on the sentences table
 # 
 # a note about probations, from the codebook pdf included with the dataset:
 #
 #   treat missing data cautiously! For example, a code of 99 (na) on the
 #   probation items may refer to 'not applicable', 'no probation', or 'missing
 #   information'. Not applicables can, with a bit of effort, be manipulated into
 #   a separate category. There is however, no viable means of separating the
 #   rest of the 99's into one category consisting of 'no probation' and another
 #   consisting of 'not ascertained' (which may include individuals who did or
 #   did not receive probation). In other words, do not assume that everyone in
 #   the residual category did not receive a probationary sentence
 #
 # with that caveat, this table is probably not useful for any aggregate analysis
 probations <- inky_df[c("unique_case_id", probation_cols)] |>
   tidyr::pivot_longer(
     cols = all_of(probation_cols),
     names_to = "charge",
     values_to = "probation",
     # strip the column prefix so only the charge name is left
     names_transform = list(
       charge = function(v) gsub("probation_", "", v)
     )
   ) |>
   # lengthening creates a row for every charge column; keep only real values
   dplyr::filter(!is.na(probation)) |>
   dplyr::distinct()

 # Adjust a dollar amount for inflation by scaling it with the ratio of two CPI
 # index values.
 # https://fred.stlouisfed.org/release/tables?rid=10&eid=36163#
 # https://research.stlouisfed.org/publications/page1-econ/2023/01/03/adjusting-for-inflation
 #
 # v:                 amount(s) in past dollars.
 # current_idx_value: CPI index value to adjust to (default: Dec 2022).
 # past_index_value:  CPI index value to adjust from (default: Dec 1971).
 inflation_adjustment <- function(v,
                                  current_idx_value = 294.883,
                                  past_index_value = 43.300) {
   cpi_ratio <- current_idx_value / past_index_value
   v * cpi_ratio
 }

 # Bail per case: the nominal amount, whether the defendant was held without
 # bail, and the inflation-adjusted amount (Inf when held without bail).
 bail_amounts <- inky_df[c(
   "unique_case_id",
   "bail_in_wob_999999_nominal_000001",
   "charge_dispositions_summary"
 )] |>
   dplyr::rename(amount = bail_in_wob_999999_nominal_000001) |>
   # not sure what a null amount means, so remove
   dplyr::filter(!is.na(amount)) |>
   dplyr::mutate(
     # the "without bail" value is actually 99999, not 999999 as noted in the
     # original column label
     held_without_bail = amount == 99999,
     real_amount = dplyr::if_else(
       held_without_bail,
       Inf,
       inflation_adjustment(amount)
     )
   ) |>
   dplyr::distinct()

 # a table describing concurrent sentencing (if any) by charge and unique_case_id
 # example table:
 # unique_case_id | charge              | concurrency
 # ---------------+---------------------+---------------------
 # 26792592       | assault_and_battery | YES-CONCURRENT TERMS
 # 26792592       | aggravated_robbery  | NO MULT SENT
 # 36052400       | attempted_larceny   | YES-BOTH
 multi_sentences <- inky_df_src |>
   inky_rename_columns() |>
   # only the multiple_sentence_* columns are turned into factor values here
   dplyr::mutate(across(all_of(multi_sentence_cols), haven::as_factor)) |>
   dplyr::select(all_of(c("unique_case_id", multi_sentence_cols))) |>
   tidyr::pivot_longer(
     cols = all_of(multi_sentence_cols),
     names_to = "charge",
     values_to = "concurrency",
     # strip the column prefix so only the charge name is left
     names_transform = list(
       charge = function(v) gsub("multiple_sentence_", "", v)
     )
   ) |>
   dplyr::filter(!is.na(concurrency)) |>
   dplyr::distinct()

 # Wide sentences table. For every minimum/maximum sentence column this adds
 # three derived columns named "<col>||lower_bound", "<col>||upper_bound" and
 # "<col>||description": the lower and upper bound (in seconds) of the coded
 # sentence range, and the code converted to its factor value. The source
 # columns are then dropped and the remaining names are rewritten to
 # "<minimum|maximum>_sentence||<charge>||<field>".
 sentences_src <- inky_df_src |>
   inky_rename_columns() |>
   dplyr::mutate(
     across(
       all_of(c(min_sentence_cols, max_sentence_cols)),
       .fns = list(
         # `sentence_enum_to_day_range` lists its entries starting at code 0
         # while R vectors are indexed from 1, so code k lives at position
         # k + 1. Indexing with the raw code picked up the bounds of the
         # previous code, and a code of 0 selected nothing at all.
         lower_bound = function(v) {
           sentence_min_bounds[as.numeric(unclass(v)) + 1]
         },
         upper_bound = function(v) {
           sentence_max_bounds[as.numeric(unclass(v)) + 1]
         },
         description = haven::as_factor
       ),
       .names = "{.col}||{.fn}"
     )
   ) |>
   # drop the columns that have been converted into bounds and descriptions
   dplyr::select(-all_of(c(min_sentence_cols, max_sentence_cols))) |>
   # rename the columns to a format where values are separated by '||'
   dplyr::rename_with(function(v) {
     is_sentence <- grepl("^minimum|^maximum", v)
     v[is_sentence] <- sub("sentence_", "sentence||", v[is_sentence])
     v
   })


 sentence_details_cols <- grep(
   "minimum_sentence|maximum_sentence",
   names(sentences_src),
   value = TRUE
 )

 # sentences spec for pivot_longer, one row per wide column:
 # .name | .value | charge | sentence_type
 # example row:
 # minimum_sentence||murder||lower_bound | lower_bound | murder | minimum_sentence
 sentences_spec_rows <- lapply(sentence_details_cols, function(column) {
   # column names look like "<sentence_type>||<charge>||<value column>"
   c(sentence_type, charge, value) %<-%
     as.list(strsplit(column, "\\|\\|")[[1]])
   c(
     .name = column,
     .value = value,
     charge = charge,
     sentence_type = sentence_type
   )
 })

 sentences_spec <- as.data.frame(do.call(rbind, sentences_spec_rows))

 # Long sentences table with columns:
 # unique_case_id, sentence type, charge, lower_bound, upper_bound, description
 sentences <- sentences_src |>
   dplyr::select(all_of(c("unique_case_id", sentence_details_cols))) |>
   tidyr::pivot_longer_spec(sentences_spec) |>
   # lengthening creates a lot of what are null rows, so drop any rows where
   # description is na
   dplyr::filter(!is.na(description)) |>
   dplyr::distinct()


 # A cases_inky table that's a little easier to digest: inky_df minus all the
 # columns that are now lengthened into their own tables.
 # Note that unique_case_id won't be unique in this table -- there can be
 # multiple rows if there were multiple charges.
 cases_inky <- dplyr::select(
   inky_df,
   -all_of(c(sentencing_columns, disposition_cols, probation_cols))
 )
	library(arrow)
	library(daoviz)
	library(dplyr)
	library(ggplot2)
	library(gt)
	library(haven)
	library(lubridate)
	library(purrr)
	library(snakecase)
	library(tidyverse)
	library(zeallot)

	################################################################################
	# notes on the data from the Inquirer stories
	################################################################################
	#
	#
	# from "Public Officials Keep Alive the Myth of 'Equal Justice'", Sunday Feb. 18,
	# 1973:
	#
	# The investigation and computer study centered on the cases of 1034 persons
	# who were indicted during 1971 for at least one of the four major crimes of
	# violence -- murder, rape, aggravated robbery and aggravated assault and
	# battery
	#
	# Using information culled from more than 10,000 court documents and 20,000
	# pages of transcripts of court proceedings, Inquirer reporters traced from
	# beginning to end -- from commission of the crime to trial and sentencing --
	# the cases of the 1,034 defendants
	#
	#
	# from "Court Story Built on 75,000 Facts", Sunday Feb. 18, 1973:
	#
	# The usual way that prosecutors and the courts maintain statistics --
	# computing convictions and acquittals -- is by the individual defendant, but
	# the Inquirer survey measured the outcome of cases from the viewpoint of the
	# victim.
	#
	# Although there were 1,034 defendants in the study, the number of cases in
	# the computer analysis totaled 1,374. The bulk of the difference represents
	# those persons who were indicted for two or more separate, major crimes
	# against different victims.
	#
	# The cases of the 1,034 defendants represent a 39 percent sampling of all
	# persons indicted in 1971 for one of the four violent crimes.


	# Locate the SPSS portable file for the 1973 Inquirer series and read it.
	inquirer_data_path <- daocore::get_data_path(
	  "1973_inquirer_series/knight_s609_spss.por"
	)

	inky_df_src <- haven::read_spss(
	  file = inquirer_data_path,
	  user_na = FALSE,
	  .name_repair = "universal"
	)

	# column identifiers exactly as they appear in the source file
	column_names <- names(inky_df_src)
	# var labels: the descriptive name attached to each column
	var_labels <- labelled::var_label(inky_df_src)
	# val labels: per-column code -> text mappings, like an enum datatype in a db
	val_labels <- labelled::val_labels(inky_df_src)

	# Some fields are missing val labels but spell them out inside the variable
	# label instead, e.g.:
	#   "SEX OF VICTIM: 1-MALE,2-FEMALE"
	#   "RACE:1-WHITE,2-BLACK,3-OTHER"
	# The pattern below detects those labels: some text, one or more ':' or '?',
	# optional whitespace, then a literal "1".
	# (the alternation bar had been escaped as "\|", which is not a valid escape
	# in an R string)
	might_be_enum_field <- ".+(\\:|\\?)+(\\s)*1"

	# Parse inline value labels out of a variable label.
	#
	# For columns that have no val labels, the variable label may still spell the
	# enum out after a ':' or '?' (the format matched by `might_be_enum_field`),
	# e.g. "SEX OF VICTIM: 1-MALE,2-FEMALE" -> c(MALE = 1L, FEMALE = 2L).
	#
	# s: a single variable label string.
	# Returns a named integer vector (label text -> code), or NULL when `s` has no
	# ':' / '?' separator or nothing after it parses as "<integer>-<label>".
	parse_val_labels_from_field_name <- function(s) {
	  # everything after the first run of ':' / '?' is the enum-ish text; treating
	  # a run such as "?:" as ONE separator keeps that text from coming back empty
	  sep <- regexpr("[:?]+", s)
	  if (is.na(sep) || sep < 0) {
	    return(NULL)
	  }
	  enum_text <- substring(s, sep + attr(sep, "match.length"))
	  items <- trimws(strsplit(enum_text, ",", fixed = TRUE)[[1]])

	  parsed <- NULL
	  for (item in items) {
	    # split on the FIRST hyphen only, so a label that itself contains hyphens
	    # (e.g. "1-YES-NO TIME") is kept whole; items without an integer code are
	    # skipped instead of producing NA entries
	    pieces <- regmatches(
	      item,
	      regexec("^\\s*(\\d+)\\s*-\\s*(.+)$", item)
	    )[[1]]
	    if (length(pieces) != 3L) {
	      next
	    }
	    parsed[pieces[3]] <- as.integer(pieces[2])
	  }
	  parsed
	}

	# Build a date from separate year/month/day values. The dataset stores years
	# as two digits, so 1900 is added to get the full year.
	inky_dataset_date_format <- function(y, m, d) {
	  full_year <- y + 1900
	  lubridate::make_date(year = full_year, month = m, day = d)
	}

	# Expand a two digit year into a four digit year. Years greater than `cutoff`
	# are placed in the 1800s, all others in the 1900s.
	inky_dataset_year <- function(y, cutoff = 70) {
	  is_prior_century <- y > cutoff
	  dplyr::if_else(
	    condition = is_prior_century,
	    true = y + 1800,
	    false = y + 1900
	  )
	}

	# Augment the val labels that came with the dataset. A column keeps its own
	# val labels when it has them; otherwise, if its variable label looks like an
	# inline enum (see `might_be_enum_field`; defendant race is an example in the
	# dataset), the val labels are parsed out of that variable label. Columns
	# matching neither case get NULL.
	labels <- lapply(column_names, function(col) {
	  col_key <- as.character(col)
	  existing <- purrr::pluck(val_labels, col_key)
	  if (!is.null(existing)) {
	    return(existing)
	  }
	  description <- purrr::pluck(var_labels, col_key, .default = col)
	  if (!grepl(might_be_enum_field, description)) {
	    return(NULL)
	  }
	  parse_val_labels_from_field_name(description)
	})

	# key the list by column name (`V${integer}`) and write it back to the data
	names(labels) <- column_names
	labelled::val_labels(inky_df_src) <- labels

	# Some variable labels reference other columns using the "V${integer}" style;
	# this function replaces each such reference with the referenced column's
	# readable label.
	# example: "DISPOSITION V94, V104, INTENT TO MURDER"
	# A referenced label may contain references of its own, so it is dereferenced
	# recursively before being substituted.
	#
	# col_name: a single variable label string.
	# labels:   named list of variable labels keyed by column id; defaults to the
	#           file-level `var_labels`.
	# Returns `col_name` with every reference expanded; a reference that has no
	# label expands to the empty string.
	dereference_column_name <- function(col_name, labels = var_labels) {
	  refs <- unique(unlist(regmatches(col_name, gregexpr("V\\d+", col_name))))
	  if (length(refs) == 0) {
	    return(col_name)
	  }
	  for (ref in refs) {
	    label <- if (ref %in% names(labels)) labels[[ref]] else NULL
	    if (is.null(label)) {
	      label <- ""
	    }
	    expanded <- dereference_column_name(label, labels)
	    # the lookahead stops a short reference such as "V9" from also rewriting
	    # the front of a longer one such as "V94"
	    col_name <- gsub(
	      paste0(ref, "(?![0-9])"),
	      expanded,
	      col_name,
	      perl = TRUE
	    )
	  }
	  # all the referenced cols are disposition columns, remove only instances of
	  # that word that have whitespace before it -- these occur within the
	  # concat'd referenced column names
	  gsub(" DISPOSITION", "", col_name)
	}


	# Sentence-length codes mapped to a (lower, upper) pair of durations, listed
	# in code order. The codebook pdf says that in addition to values 0 thru 9 as
	# defined here there's also a life sentence coded "++", but it doesn't seem
	# like that exists in the dataset.
	sentence_enum_to_day_range <- list(
	  list(ddays(0), ddays(0)),                     # sentence suspended
	  list(ddays(0), ddays(0)),                     # fines and costs
	  list(ddays(1), dmonths(3)),
	  list(dmonths(4), dmonths(6)),
	  list(dmonths(7), dmonths(12)),
	  list(dmonths(13), dyears(2)),
	  list(dyears(2) + ddays(1), dyears(5)),
	  list(dyears(5) + ddays(1), dyears(10)),
	  list(dyears(10) + ddays(1), dyears(20)),
	  list(dyears(20) + ddays(1), dyears(Inf)),
	  list(dyears(Inf), dyears(Inf))                # life
	)

	# flatten each side of the ranges into its own vector, one entry per code
	sentence_min_bounds <- unlist(
	  lapply(sentence_enum_to_day_range, function(bounds) bounds[1])
	)
	sentence_max_bounds <- unlist(
	  lapply(sentence_enum_to_day_range, function(bounds) bounds[2])
	)

	# Readable column names: the snake-cased, dereferenced variable label of each
	# column, keyed by the original column name.
	formatted_col_names <- lapply(column_names, function(col) {
	  label <- purrr::pluck(var_labels, as.character(col), .default = col)
	  readable <- dereference_column_name(label)
	  # unlabeled columns (which we'll ignore) come back as '' -- keep their
	  # original name
	  if (readable == "") col else snakecase::to_snake_case(readable)
	})

	names(formatted_col_names) <- column_names

	# Map original column names to their entries in `formatted_col_names`.
	snake_case_columns <- function(v) {
	  lookup <- function(s) {
	    purrr::pluck(formatted_col_names, s)
	  }
	  purrr::map_chr(v, lookup)
	}

	# Expand abbreviations that appear in column names. Currently only "aggr_a_b"
	# (-> "aggr_assault_and_battery") is handled.
	standard_column_names <- function(v) {
	  gsub(
	    pattern = "aggr_a_b",
	    replacement = "aggr_assault_and_battery",
	    x = v
	  )
	}

	# Apply all column renamings in order: snake-cased variable labels, then the
	# manual `renamings`, then abbreviation expansion.
	# (the pipes here had been escaped as "\|>", which does not parse; the steps
	# are now plain nested-free calls)
	inky_rename_columns <- function(df) {
	  snake_cased <- dplyr::rename_with(df, snake_case_columns)
	  manually_renamed <- dplyr::rename(snake_cased, all_of(renamings))
	  dplyr::rename_with(manually_renamed, standard_column_names)
	}

	# Consolidate the date fields that exist as three columns
	# (`year_of_<event>`, `month_of_<event>`, `day_of_<event>`) into a single
	# `<event>_date` column per event, then drop the three source columns.
	# (the final pipeline had its pipes escaped as "\|>", which does not parse)
	datepart_fields_mutations <- function(df) {
	  event_names <- c("arrest", "prelim_hearing", "indictment", "arraignment",
	                   "trial", "sentence")
	  datepart_names <- c("year_of", "month_of", "day_of")

	  # source column names for one event, in year/month/day order
	  source_columns <- function(event) {
	    paste0(datepart_names, "_", event)
	  }

	  # one quoted `inky_dataset_date_format(year, month, day)` call per event;
	  # each argument takes the first element of its source column
	  date_exprs <- lapply(event_names, function(event) {
	    parts <- lapply(source_columns(event), function(column) {
	      quo(purrr::pluck(!!sym(column), 1))
	    })
	    quo(inky_dataset_date_format(!!!parts))
	  })
	  # new columns we're creating
	  names(date_exprs) <- paste0(event_names, "_date")

	  drop_columns <- unlist(lapply(event_names, source_columns))

	  with_dates <- dplyr::mutate(df, !!!date_exprs)
	  dplyr::select(with_dates, -all_of(drop_columns))
	}

	# Manual column renamings, written as new_name = old_name. The old names are
	# snake-cased variable labels that still carry their inline enum text.
	renamings <- c(
	  race_defendant        = "race_1_white_2_black_3_other",
	  prior_arrests         = "prior_arrests_1_yes_2_no",
	  transcript_available  = "transcript_1_yes_2_no_3_copied",
	  race_judge            = "race_of_judge_1_black_2_white",
	  jury_trial            = "jury_trial_1_yes_2_no",
	  had_transcript        = "was_there_a_transcript_1_yes_2_no",
	  party_of_judge        = "party_of_judge_1_dem_2_rep",
	  was_judge_a_former_da = "was_judge_a_former_da_1_yes_2_no",
	  race_of_victim        = "race_of_victim_1_white_2_black_3_other",
	  data_compiled_by      = "data_compiled_by_1_barlett_2_steele",
	  sex_of_victim         = "sex_of_victim_1_male_2_female",
	  sex_of_defendant      = "sex_of_defendant_1_male_2_female"
	)

	# Main cleaned table: readable column names, factor values, a four digit
	# year_born and one date column per event.
	# (restores the native pipe, which had been escaped as "\|>")
	inky_df <- inky_df_src |>
	  inky_rename_columns() |>
	  # turns all the spss integers into their factor values
	  haven::as_factor() |>
	  dplyr::mutate(year_born = inky_dataset_year(year_born)) |>
	  dplyr::rowwise() |>
	  datepart_fields_mutations() |>
	  # incident date doesn't fit the naming pattern handled by
	  # datepart_fields_mutations(), so build that one manually
	  dplyr::mutate(
	    incident_date = inky_dataset_date_format(
	      year_of_incident_last_2_digits[1],
	      month_of_incident_1_thru_12[1],
	      day_of_incident_1_thru_31[1]
	    )
	  ) |>
	  dplyr::select(-all_of(c(
	    # condensed into incident_date above
	    "year_of_incident_last_2_digits",
	    "month_of_incident_1_thru_12",
	    "day_of_incident_1_thru_31",
	    # unnamed and null
	    "V67",
	    "V138"
	  )))

	# group the per-charge columns by the prefix of their name
	col_names <- names(inky_df)
	disposition_cols <- grep("^disposition", col_names, value = TRUE)
	probation_cols <- grep("^probation", col_names, value = TRUE)
	min_sentence_cols <- grep("^minimum_sentence", col_names, value = TRUE)
	max_sentence_cols <- grep("^maximum_sentence", col_names, value = TRUE)
	multi_sentence_cols <- grep("^multiple_sentence", col_names, value = TRUE)

	sentencing_columns <- c(
	  min_sentence_cols,
	  max_sentence_cols,
	  multi_sentence_cols
	)

	# For debugging: returns a function that, given one row's values for
	# `columns`, collects the names of the columns that have a value, sorts them,
	# strips `pattern` from each and joins them with '||' into one text field.
	# (the separator had been escaped as '\|\|', which is not a valid R string)
	summary_column <- function(pattern, columns) {
	  function(v) {
	    present <- columns[!is.na(v)]
	    stripped <- gsub(pattern, "", stringr::str_sort(present))
	    paste0(stripped, collapse = "||")
	  }
	}

	# names of the debugging summary columns
	summary_columns <- c(
	  "charge_dispositions_summary",
	  "charge_probations_summary",
	  "charge_minimum_sentences_summary",
	  "charge_maximum_sentences_summary",
	  "charge_multi_sentences_summary"
	)

	# For debugging: put the names of the charges that have a value into one
	# summary column per column group, so it's easier to digest what data is and
	# is not present for each row before those columns are lengthened.
	# (restores the native pipe, which had been escaped as "\|>")
	inky_df <- inky_df |>
	  dplyr::rowwise() |>
	  dplyr::mutate(
	    charge_dispositions_summary =
	      summary_column("^disposition_", disposition_cols)(
	        c_across(all_of(disposition_cols))
	      ),
	    charge_probations_summary =
	      summary_column("^probation_", probation_cols)(
	        c_across(all_of(probation_cols))
	      ),
	    charge_minimum_sentences_summary =
	      summary_column("^minimum_sentence_", min_sentence_cols)(
	        c_across(all_of(min_sentence_cols))
	      ),
	    charge_maximum_sentences_summary =
	      summary_column("^maximum_sentence_", max_sentence_cols)(
	        c_across(all_of(max_sentence_cols))
	      ),
	    # summarize the multiple_sentence_* columns here; this previously read
	    # the maximum_sentence_* columns, so the prefix was never stripped and
	    # the wrong columns were reported
	    charge_multi_sentences_summary =
	      summary_column("^multiple_sentence_", multi_sentence_cols)(
	        c_across(all_of(multi_sentence_cols))
	      )
	  )


	# create a dispositions table with three columns: unique_case_id, charge,
	# disposition_outcome
	# example row:
	# unique_case_id | charge             | disposition_outcome
	# ---------------|--------------------|--------------------
	# 7466801        | murder             | FND GLTY-LSSR OFFNS
	# (restores the native pipe, which had been escaped as "\|>")
	dispositions <- inky_df[c("unique_case_id", disposition_cols)] |>
	  tidyr::pivot_longer(
	    cols = all_of(disposition_cols),
	    names_to = "charge",
	    values_to = "disposition_outcome",
	    # strip the column prefix so only the charge name is left
	    names_transform = list(
	      charge = function(v) gsub("disposition_", "", v)
	    )
	  ) |>
	  # lengthening creates a row for every charge column; keep only real values
	  dplyr::filter(!is.na(disposition_outcome)) |>
	  dplyr::distinct()

	# create a probations table with three columns: unique_case_id, charge,
	# probation
	# example table:
	# unique_case_id | charge             | probation
	# ---------------|--------------------|--------------------
	# 7466801        | murder             | YES-NO TIME
	# 7466802        | murder             | YES-WITH TIME
	#
	# YES-NO TIME: yes, with no time in jail
	# YES-WITH TIME: yes, after time served
	# a YES-WITH TIME value can indicate that jail time was sentenced for another
	# charge in the same case. you can see these when joining on the sentences table
	#
	# a note about probations, from the codebook pdf included with the dataset:
	#
	# treat missing data cautiously! For example, a code of 99 (na) on the
	# probation items may refer to 'not applicable', 'no probation', or 'missing
	# information'. Not applicables can, with a bit of effort, be manipulated into
	# a separate category. There is however, no viable means of separating the
	# rest of the 99's into one category consisting of 'no probation' and another
	# consisting of 'not ascertained' (which may include individuals who did or
	# did not receive probation). In other words, do not assume that everyone in
	# the residual category did not receive a probationary sentence
	#
	# with that caveat, this table is probably not useful for any aggregate analysis
	# (restores the native pipe, which had been escaped as "\|>")
	probations <- inky_df[c("unique_case_id", probation_cols)] |>
	  tidyr::pivot_longer(
	    cols = all_of(probation_cols),
	    names_to = "charge",
	    values_to = "probation",
	    # strip the column prefix so only the charge name is left
	    names_transform = list(
	      charge = function(v) gsub("probation_", "", v)
	    )
	  ) |>
	  # lengthening creates a row for every charge column; keep only real values
	  dplyr::filter(!is.na(probation)) |>
	  dplyr::distinct()

	# adjust a dollar amount for inflation using the ratio of two CPI index values
	# https://fred.stlouisfed.org/release/tables?rid=10&eid=36163#
	# https://research.stlouisfed.org/publications/page1-econ/2023/01/03/adjusting-for-inflation
	#
	# v: numeric amount(s) expressed in the past period's dollars
	# current_idx_value: index value for the target period (default: Dec 2022)
	# past_index_value: index value for the original period (default: Dec 1971)
	# returns: v multiplied by current_idx_value / past_index_value
	inflation_adjustment <- function(v,
	                                 current_idx_value = 294.883,
	                                 past_index_value = 43.300) {
	  index_ratio <- current_idx_value / past_index_value
	  v * index_ratio
	}

	# bail amount per case, alongside an inflation-adjusted real_amount. cases
	# held without bail are flagged and given an infinite real_amount.
	bail_amounts <- inky_df[c(
	  'unique_case_id',
	  'bail_in_wob_999999_nominal_000001',
	  'charge_dispositions_summary'
	)] |>
	  dplyr::rename(amount = 'bail_in_wob_999999_nominal_000001') |>
	  # not sure what a null amount means, so remove
	  dplyr::filter(!is.na(amount)) |>
	  dplyr::mutate(
	    # value is actually 99999, not 999999 as noted in original column label
	    held_without_bail = amount == 99999,
	    real_amount = dplyr::if_else(
	      held_without_bail,
	      Inf,
	      inflation_adjustment(amount)
	    )
	  ) |>
	  dplyr::distinct()

	# a table describing concurrent sentencing (if any) by charge and unique_case_id
	# example table:
	# unique_case_id | charge              | concurrency
	# ---------------+---------------------+---------------------
	# 26792592       | assault_and_battery | YES-CONCURRENT TERMS
	# 26792592       | aggravated_robbery  | NO MULT SENT
	# 36052400       | attempted_larceny   | YES-BOTH
	multi_sentences <- inky_df_src |>
	  inky_rename_columns() |>
	  # convert the multiple-sentence columns to factors before lengthening
	  dplyr::mutate(
	    across(
	      all_of(multi_sentence_cols),
	      haven::as_factor
	    )
	  ) |>
	  dplyr::select(all_of(c('unique_case_id', multi_sentence_cols))) |>
	  tidyr::pivot_longer(
	    cols = all_of(multi_sentence_cols),
	    names_to = "charge",
	    values_to = "concurrency",
	    names_transform = list(
	      # the charge is the column name with the 'multiple_sentence_' text
	      # removed; gsub is already vectorized
	      charge = function(col_names) gsub('multiple_sentence_', '', col_names)
	    )
	  ) |>
	  dplyr::filter(!is.na(concurrency)) |>
	  dplyr::distinct()

	# transforms for a sentences table. in the wide version of the table, create
	# columns for the lower and upper bound (in seconds) of the sentencing and add a
	# description column by converting the wide column values into factors
	sentences_src <- inky_df_src |>
	  inky_rename_columns() |>
	  dplyr::mutate(
	    across(
	      all_of(c(min_sentence_cols, max_sentence_cols)),
	      .fns = list(
	        # index the bounds vectors (defined elsewhere in this file) by the
	        # column's values
	        lower_bound = function(v) sentence_min_bounds[v],
	        upper_bound = function(v) sentence_max_bounds[v],
	        description = haven::as_factor
	      ),
	      # produces names of the form '<column>||<function name>'
	      .names = "{.col}||{.fn}"
	    )
	  ) |>
	  # drop the columns that have been converted into bounds and descriptions
	  dplyr::select(-all_of(c(
	    min_sentence_cols,
	    max_sentence_cols
	  ))) |>
	  # rename the columns to a format where values are separated by '||',
	  # e.g. minimum_sentence||murder||lower_bound
	  dplyr::rename_with(function(col_names) {
	    # only names starting with minimum/maximum are touched; sub() replaces
	    # just the first 'sentence_' in each of them
	    generated <- grepl('^minimum|^maximum', col_names)
	    col_names[generated] <- sub(
	      'sentence_', 'sentence||', col_names[generated]
	    )
	    col_names
	  })


	# names of every sentences_src column that holds a minimum/maximum sentence
	# detail (lower bound, upper bound or description)
	sentence_details_cols <- grep(
	  'minimum_sentence|maximum_sentence',
	  names(sentences_src),
	  value = TRUE
	)

	# sentences spec for pivot_longer
	# .name | .value | charge | sentence_type
	# example row:
	# minimum_sentence||murder||lower_bound | lower_bound | murder | minimum_sentence
	sentences_spec_rows <- purrr::map(sentence_details_cols, function(col_name) {
	  # column names have the form '<sentence_type>||<charge>||<value>'; split on
	  # the literal separator rather than a regex
	  parts <- strsplit(col_name, '||', fixed = TRUE)[[1]]
	  # fail loudly on a name that does not have exactly three parts
	  stopifnot(length(parts) == 3L)
	  c(
	    .name = col_name,
	    .value = parts[[3]],
	    charge = parts[[2]],
	    sentence_type = parts[[1]]
	  )
	})

	# stack the named character vectors into a data frame with one row per column
	sentences_spec <- as.data.frame(do.call(rbind, sentences_spec_rows))

	# create our sentences table:
	# unique_case_id, sentence type, charge, lower_bound, upper_bound, description
	sentences <- sentences_src[c('unique_case_id', sentence_details_cols)] |>
	  # lengthen using the hand-built spec
	  tidyr::pivot_longer_spec(sentences_spec) |>
	  # in our lengthening, we've created a lot of what are null rows so drop any
	  # rows where description is na
	  dplyr::filter(!is.na(description)) |>
	  dplyr::distinct()


	# create a cases_inky table that's a little easier to digest: minus all the
	# columns that are now lengthened in different tables.
	# note that the unique_case_id won't be unique in this table -- there can be
	# multiple rows if there were multiple charges
	cases_inky <- inky_df |>
	  dplyr::select(-all_of(c(
	    sentencing_columns,
	    disposition_cols,
	    probation_cols
	  )))