Last active
March 30, 2023 17:54
-
-
Save natlownes/34a67777c9756b95efe40ffa33139542 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(arrow) | |
library(daoviz) | |
library(dplyr) | |
library(ggplot2) | |
library(gt) | |
library(haven) | |
library(lubridate) | |
library(purrr) | |
library(snakecase) | |
library(tidyverse) | |
library(zeallot) | |
################################################################################ | |
# notes on the data from the Inquirer stories | |
################################################################################ | |
# | |
# | |
# from "Public Officals Keep Alive the Myth of 'Equal Justice'", Sunday Feb. 18, | |
# 1973: | |
# | |
# The investigation and computer study centered on the cases of 1034 persons | |
# who were indicted during 1971 for at least one of the four major crimes of | |
# violence -- murder, rape, aggravated robbery and aggravated assault and | |
# battery | |
# | |
# Using information culled from more than 10,000 court documents and 20,000 | |
# pages of transcripts of court proceedings, Inquirer reporters traced from | |
# beginning to end -- from commission of the crime to trial and sentencing -- | |
# the cases of the 1,034 defendants | |
# | |
# | |
# from "Court Story Build on 75,000 Facts", Sunday Feb. 18, 1973: | |
# | |
# The usual way that prosecutors and the courts maintain statistics -- | |
# computing convictions and acquittals -- is by the individual defendant, but | |
# the Inquirer survey measured the outcome of cases from the viewpoint of the | |
# victim. | |
# | |
# Although there were 1,034 defendants in the study, the number of cases in | |
# the computer analysis totaled 1,374. The bulk of the difference represents | |
# those person who were indicted for two or more separate, major crimes | |
# against different victims. | |
# | |
# The cases of the 1,034 defendants represent a 39 percent sampling of all | |
# persons indicted in 1971 for one of the four violent crimes. | |
# load spss files | |
inquirer_data_path <- daocore::get_data_path( | |
'1973_inquirer_series/knight_s609_spss.por') | |
inky_df_src <- haven::read_spss(inquirer_data_path, user_na=FALSE, | |
.name_repair = 'universal') | |
column_names <- names(inky_df_src) | |
# var labels are a descriptive name for the column | |
var_labels <- labelled::var_label(inky_df_src) | |
# val labels are like an enum datatype in a db | |
val_labels <- labelled::val_labels(inky_df_src) | |
# there are some fields in the data that are missing val labels but have what | |
# can be parsed as val labels in the field name. examples: | |
# "SEX OF VICTIM: 1-MALE,2-FEMALE" | |
# "RACE:1-WHITE,2-BLACK,3-OTHER" | |
might_be_enum_field = ".+(\\:|\\?)+(\\s)*1" | |
# for columns that don't have val labels, try and parse out what should be val | |
# labels from the column name. These columns follow the format of the `might_be_enum_field` regex above | |
parse_val_labels_from_field_name <- function(s){ | |
parts <- stringr::str_split(s, '\\:|\\?') | |
# toss out the field name, keep the enum-ish string | |
vals <- purrr::pluck(parts, 1, 2) | |
if(is.null(vals)){ | |
return(vals) | |
} | |
items <- lapply(as.list(strsplit(vals, ',')[[1]]), stringr::str_trim) | |
reduce(items, function(acc, v){ | |
sp <- as.list(strsplit(v, '-')[[1]]) | |
acc[purrr::pluck(sp, 2)] <- as.integer(purrr::pluck(sp, 1)) | |
acc | |
}, .init=c()) | |
} | |
# the inky dataset columns are two digit year | |
inky_dataset_date_format <- function(y,m,d){ | |
lubridate::make_date(y + 1900, m, d) | |
} | |
# makes a two digit year into a four digit year depending on the specified | |
# cutoff. for example the year born column is two digits, if the two digit year | |
# is gt cutoff 70, assume 1800 | |
inky_dataset_year <- function(y, cutoff=70) { | |
dplyr::if_else(y > cutoff, y + 1800, y + 1900) | |
} | |
# this augments the set of labels that came with the dataset. some columns have | |
# val labels we can use, but some that should have them are missing them. for | |
# some of these columns we can use the value of the var_label for the column to | |
# come up with the missing val labels. (defendant race is an example in the | |
# dataset) | |
labels <- lapply(column_names, function(l){ | |
key <- as.character(l) | |
val <- purrr::pluck(var_labels, key, .default=l) | |
label <- purrr::pluck(val_labels, key) | |
if(!is.null(label)){ | |
return(label) | |
} | |
if(grepl(might_be_enum_field, val)){ | |
return(parse_val_labels_from_field_name(val)) | |
} | |
}) | |
# make our labels list a named list using the column names (`V${integer}`) | |
names(labels) <- column_names | |
labelled::val_labels(inky_df_src) <- labels | |
# some column names reference other column names using the "V${integer} style; | |
# this function replaces any of those appearances with the readable column name | |
# example: "DISPOSITION V94, V104, INTENT TO MURDER" | |
# note that a returned referenced column may also have references of its own | |
dereference_column_name <- function(col_name){ | |
refs <- unlist(stringr::str_match_all(col_name, 'V\\d+')) | |
if(purrr::is_empty(refs)){ | |
return(col_name) | |
} | |
lapply(refs, function(ref){ | |
val <- dereference_column_name(purrr::pluck(var_labels, ref, .default='')) | |
col_name <<- gsub(ref, val, col_name) | |
}) | |
# all the referenced cols are disposition columns, remove only instances of | |
# that word that have whitespace before it-- these occur within the concat'd | |
# referenced column names | |
return(gsub(' DISPOSITION', '', col_name)) | |
} | |
# the codebook pdf says that in addition to values 0 thru 9 as defined here, | |
# there's also life sentence "++" but it doesn't seem like that exists in the | |
# dataset | |
sentence_enum_to_day_range <- list( | |
# sentence suspended | |
list(ddays(0), ddays(0)), | |
# Fines and costs | |
list(ddays(0), ddays(0)), | |
list(ddays(1), dmonths(3)), | |
list(dmonths(4), dmonths(6)), | |
list(dmonths(7), dmonths(12)), | |
list(dmonths(13), dyears(2)), | |
list(dyears(2) + ddays(1), dyears(5)), | |
list(dyears(5) + ddays(1), dyears(10)), | |
list(dyears(10) + ddays(1), dyears(20)), | |
list(dyears(20) + ddays(1), dyears(Inf)), | |
# life | |
list(dyears(Inf), dyears(Inf)) | |
) | |
sentence_min_bounds <- unlist( | |
map(sentence_enum_to_day_range, function(x){x[1]})) | |
sentence_max_bounds <- unlist( | |
map(sentence_enum_to_day_range, function(x){x[2]})) | |
# use a snake-cased version of the columns var labels so the fields are a little | |
# more readable | |
formatted_col_names <- lapply(column_names, function(e){ | |
val <- purrr::pluck(var_labels, as.character(e), .default=e) | |
val <- dereference_column_name(val) | |
# special case to handle the unlabeled columns which we'll ignore | |
if(val==''){ | |
return(e) | |
} | |
snakecase::to_snake_case(val) | |
}) | |
names(formatted_col_names) <- column_names | |
snake_case_columns <- function(v){ | |
purrr::map_chr(v, function(s) { | |
purrr::pluck(formatted_col_names, s) | |
}) | |
} | |
# some column names are abbreviated, expand those abbreviations in this function | |
standard_column_names <- function(v){ | |
gsub('aggr_a_b', 'aggr_assault_and_battery', v) | |
} | |
# wrap up all our column renamings into one function | |
inky_rename_columns <- function(df) { | |
df |> | |
dplyr::rename_with(snake_case_columns) |> | |
dplyr::rename(all_of(renamings)) |> | |
dplyr::rename_with(standard_column_names) | |
} | |
# consolidates the set of date fields that exist as three columns and follow a | |
# naming pattern into their own date columns and drops the three source columns | |
datepart_fields_mutations <- function(df) { | |
event_names <- c('arrest', 'prelim_hearing', 'indictment', 'arraignment', | |
'trial', 'sentence') | |
datepart_names <- c('year_of', 'month_of', 'day_of') | |
# new columns we're creating | |
new_field_names <- map(event_names, function(e){ | |
sym(paste0(e, '_date')) | |
}) | |
drop_columns <- unname(unlist(map(event_names, function(e){ | |
lapply(datepart_names, function(d){ | |
paste0(d, '_', e) | |
}) | |
}))) | |
operations <- map(event_names, function(e){ | |
args <- map(datepart_names, function(d){ | |
var_name <- paste0(d, "_", e) | |
quo(purrr::pluck(!!sym(var_name), 1)) | |
}) | |
quo(inky_dataset_date_format(!!!args)) | |
}) | |
args <- setNames( | |
operations, | |
new_field_names | |
) | |
df |> | |
dplyr::mutate(!!!args) |> | |
dplyr::select(-all_of(drop_columns)) | |
} | |
# a list of manual column renamings | |
renamings <- c( | |
"race_defendant" = "race_1_white_2_black_3_other", | |
"prior_arrests" = "prior_arrests_1_yes_2_no", | |
"transcript_available" = "transcript_1_yes_2_no_3_copied", | |
"race_judge" = "race_of_judge_1_black_2_white", | |
"jury_trial" = "jury_trial_1_yes_2_no", | |
"had_transcript" = "was_there_a_transcript_1_yes_2_no", | |
"party_of_judge" = "party_of_judge_1_dem_2_rep", | |
"was_judge_a_former_da" = "was_judge_a_former_da_1_yes_2_no", | |
"race_of_victim" = "race_of_victim_1_white_2_black_3_other", | |
"data_compiled_by" = "data_compiled_by_1_barlett_2_steele", | |
"sex_of_victim" = "sex_of_victim_1_male_2_female", | |
"sex_of_defendant" = "sex_of_defendant_1_male_2_female" | |
) | |
# do a buncha transforms to consolidate columns | |
inky_df <- inky_df_src |> | |
inky_rename_columns() |> | |
# turns all the spss integers into their factor values | |
haven::as_factor() |> | |
# now that we've got some nicer column names and values to work with, add some | |
# nicer data types | |
dplyr::mutate( | |
year_born = inky_dataset_year(year_born) | |
) |> | |
dplyr::rowwise() |> | |
datepart_fields_mutations() |> | |
# incident date doesn't fit the pattern of the other fields as defined in | |
# datepart_fields_mutations() so do that one manually | |
dplyr::mutate( | |
incident_date = inky_dataset_date_format( | |
year_of_incident_last_2_digits[1], | |
month_of_incident_1_thru_12[1], | |
day_of_incident_1_thru_31[1]) | |
) |> | |
# drop these columns that were condensed into a single date column | |
dplyr::select(-all_of(c( | |
'year_of_incident_last_2_digits', | |
'month_of_incident_1_thru_12', | |
'day_of_incident_1_thru_31' | |
))) |> | |
# drop these two columns which are unnamed and null | |
dplyr::select(-all_of(c( | |
'V67', | |
'V138' | |
))) | |
col_names <- names(inky_df) | |
disposition_cols <- names(inky_df[grepl('^disposition', col_names)]) | |
probation_cols <- names(inky_df[grepl('^probation', col_names)]) | |
min_sentence_cols <- names(inky_df[grepl('^minimum_sentence', col_names)]) | |
max_sentence_cols <- names(inky_df[grepl('^maximum_sentence', col_names)]) | |
multi_sentence_cols <- names(inky_df[grepl('^multiple_sentence', col_names)]) | |
sentencing_columns <- c( | |
min_sentence_cols, | |
max_sentence_cols, | |
multi_sentence_cols) | |
# for debugging-- collect all the names of columns that have a value, summarize | |
# as one text field | |
summary_column <- function(pattern, columns) { | |
return(function(v){ | |
s <- stringr::str_sort(columns[!is.na(v)]) | |
paste0(gsub(pattern, '', s), collapse='||') | |
}) | |
} | |
summary_columns <- c('charge_dispositions_summary', 'charge_probations_summary', 'charge_minimum_sentences_summary', 'charge_maximum_sentences_summary', 'charge_multi_sentences_summary') | |
inky_df <- inky_df |> | |
rowwise() |> | |
# for debugging-- put any values we'll be lengthening into one column so it's easier to digest what data is and is not present for each row | |
mutate( | |
charge_dispositions_summary = summary_column('^disposition_', disposition_cols)(c_across(all_of(disposition_cols))), | |
charge_probations_summary = summary_column('^probation_', probation_cols)(c_across(all_of(probation_cols))), | |
charge_minimum_sentences_summary = summary_column('^minimum_sentence_', min_sentence_cols)(c_across(all_of(min_sentence_cols))), | |
charge_maximum_sentences_summary = summary_column('^maximum_sentence_', max_sentence_cols)(c_across(all_of(max_sentence_cols))), | |
charge_multi_sentences_summary = summary_column('^multiple_sentence_', max_sentence_cols)(c_across(all_of(max_sentence_cols))) | |
) | |
# create a dispositions table with three columns: unique_case_id, charge, | |
# disposition_outcome | |
# example row: | |
# unique_case_id | charge | disposition_outcome | |
# ---------------|--------------------|-------------------- | |
# 7466801 | murder | FND GLTY-LSSR OFFNS | |
dispositions <- inky_df[c('unique_case_id', disposition_cols)] |> | |
pivot_longer( | |
cols = all_of(disposition_cols), | |
names_to = c("charge"), | |
values_to = c("disposition_outcome"), | |
names_transform = list( | |
charge = function(v){ | |
purrr::map_chr(v, function(c){ | |
gsub('disposition_', '', c) | |
}) | |
} | |
) | |
) |> | |
filter(!is.na(disposition_outcome)) |> | |
dplyr::distinct() | |
# create a probations table with three columns: unique_case_id, charge, | |
# probation | |
# example table: | |
# unique_case_id | charge | probation | |
# ---------------|--------------------|-------------------- | |
# 7466801 | murder | YES-NO TIME | |
# 7466802 | murder | YES-WITH TIME | |
# | |
# YES-NO TIME: yes, with no time in jail | |
# YES-WITH TIME: yes, after time served | |
# a YES-WITH TIME value can indicate that jail time was sentenced for another | |
# charge in the same case. you can see these when joining on the sentences table | |
# | |
# a note about probations, from the codebook pdf included with the dataset: | |
# | |
# treat missing data cautiously! For example, a code of 99 (na) on the | |
# probation items may refer to 'not applicable', 'no probation', or 'missing | |
# information'. Not applicables can, with a bit of effort, be manipulated into | |
# a separate category. There is however, no viable means of separating the | |
# rest of the 99's into one category consisting of 'no probation' and another | |
# consisting of 'not ascertained' (which may include individuals who did or | |
# did no receive probation). In other words, do not assume that everyone in | |
# the residual category did not receive a probationary sentence | |
# | |
# with that caveat, this table is probably not useful for any aggregate analysis | |
probations <- inky_df[c('unique_case_id', probation_cols)] |> | |
pivot_longer( | |
cols = all_of(probation_cols), | |
names_to = c("charge"), | |
values_to = c("probation"), | |
names_transform = list( | |
charge = function(v){ | |
purrr::map_chr(v, function(c){ | |
gsub('probation_', '', c) | |
}) | |
} | |
) | |
) |> | |
dplyr::filter(!is.na(probation)) |> | |
dplyr::distinct() | |
# CPI index value, Dec 2022 | |
# https://fred.stlouisfed.org/release/tables?rid=10&eid=36163# | |
# https://research.stlouisfed.org/publications/page1-econ/2023/01/03/adjusting-for-inflation | |
# default values are current: Dec 2022, past: Dec 1971 | |
inflation_adjustment <- function(v, | |
current_idx_value=294.883, | |
past_index_value=43.300) { | |
v * (current_idx_value / past_index_value) | |
} | |
bail_amounts <- inky_df[c('unique_case_id', 'bail_in_wob_999999_nominal_000001', 'charge_dispositions_summary')] |> | |
dplyr::rename( | |
'amount'='bail_in_wob_999999_nominal_000001' | |
) |> | |
# not sure what a null amount means, so remove | |
dplyr::filter(!is.na(amount)) |> | |
mutate ( | |
# value is actually 99999, note 999999 as noted in original column label | |
held_without_bail = if_else(amount == 99999, TRUE, FALSE), | |
real_amount = if_else(amount == 99999, | |
Inf, | |
inflation_adjustment(amount)) | |
) |> | |
dplyr::distinct() | |
multi_sentences <- inky_df_src |> | |
inky_rename_columns() |> | |
mutate( | |
across( | |
all_of(multi_sentence_cols), | |
haven::as_factor | |
) | |
) | |
# a table describing concurrent sentencing (if any) by charge and unique_case_id | |
# example table: | |
# unique_case_id | charge | concurrency | |
# ---------------+---------------------+--------------------- | |
# 26792592 | assault_and_battery | YES-CONCURRENT TERMS | |
# 26792592 | aggravated_robbery | NO MULT SENT | |
# 36052400 | attempted_larceny | YES-BOTH | |
multi_sentences <- multi_sentences[c('unique_case_id', multi_sentence_cols)] |> | |
pivot_longer( | |
cols = all_of(multi_sentence_cols), | |
names_to = c("charge"), | |
values_to = c("concurrency"), | |
names_transform = list( | |
charge = function(v){ | |
purrr::map_chr(v, function(c){ | |
gsub('multiple_sentence_', '', c) | |
}) | |
} | |
) | |
) |> | |
filter(!is.na(concurrency)) |> | |
dplyr::distinct() | |
# transforms for a sentences table. in the wide version of the table, create | |
# columns for the lower and upper bound (in seconds) of the sentencing and add a | |
# description column by converting the wide column values into factors | |
sentences_src <- inky_df_src |> | |
inky_rename_columns() |> | |
mutate( | |
across( | |
all_of(c(min_sentence_cols, max_sentence_cols)), | |
.fns = list( | |
lower_bound = function(v){ | |
sentence_min_bounds[v] | |
}, | |
upper_bound = function(v){ | |
sentence_max_bounds[v] | |
}, | |
description = haven::as_factor | |
), | |
.names = "{.col}||{.fn}" | |
) | |
) |> | |
# drop the columns that have been converted into bounds and descriptions | |
dplyr::select(-all_of(c( | |
min_sentence_cols, | |
max_sentence_cols | |
))) |> | |
# rename the columns to a format where values are separated by '||' | |
dplyr::rename_with(function(v){ | |
purrr::map_chr(v, function(s) { | |
if(grepl('^minimum|^maximum', s)){ | |
return(sub('sentence_', 'sentence||', s)) | |
} | |
s | |
}) | |
}) | |
sentence_details_cols <- names(sentences_src[ | |
grepl('minimum_sentence|maximum_sentence', names(sentences_src))]) | |
# sentences spec for pivot_longer | |
# .name | .value | charge | sentence_type | |
# example row: | |
# minimum_sentence||murder||lower_bound | lower_bound | murder | minimum_sentence | |
sentences_spec_rows <- purrr::map(sentence_details_cols, function(e){ | |
c(sentence_type, charge, value) %<-% as.list(strsplit(e, '\\|\\|')[[1]]) | |
c(.name=e, .value=value, charge=charge, sentence_type=sentence_type) | |
}) | |
sentences_spec <- as.data.frame(do.call(rbind, sentences_spec_rows)) | |
# create our sentences table: | |
# unique_case_id, sentence type, charge, lower_bound, upper_bound, description | |
sentences <- sentences_src[c('unique_case_id', sentence_details_cols)] |> | |
tidyr::pivot_longer_spec(sentences_spec) |> | |
# in our lengthening, we've created a lot of what are null rows so drop any | |
# rows where description is na | |
dplyr::filter(!is.na(description)) |> | |
dplyr::distinct() | |
# create a cases_inky table that's a little easier to digest: minus all the columns | |
# that are now lengthened in different tables. | |
# note the the unique_case_id won't be unique in this table-- there can be | |
# multiple rows if there were multiple charges | |
cases_inky <- inky_df |> | |
dplyr::select(-all_of(c( | |
sentencing_columns, | |
disposition_cols, | |
probation_cols | |
))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment