Update interpret pipeline stage to optionally generate leaf node comps for all properties #106

Merged
Commits (82)
32f1e6b
Add intermediate leaf_node output to interpret pipeline stage
jeancochrane Dec 11, 2023
7b57e6c
Add python get_comps function for computing comps from leaf node assi…
jeancochrane Dec 11, 2023
9741956
Flesh out comp calculation
jeancochrane Dec 12, 2023
94ad49f
Add Python requirements to renv environment
jeancochrane Dec 12, 2023
3b15e42
Make sure assessment data is loaded in interpret stage when comp_enab…
jeancochrane Dec 12, 2023
cfddfd2
Continue with comps debugging
jeancochrane Dec 20, 2023
db0eb87
Refactor and test get_comps logic
jeancochrane Dec 20, 2023
74aa875
Merge branch 'master' into jeancochrane/41-add-comparables-finding-ou…
jeancochrane Dec 20, 2023
1d9ea6a
Clean up comments and extraneous debugging code ahead of testing
jeancochrane Dec 20, 2023
5627e18
Temporarily set comp_enable=TRUE for the purposes of testing comps
jeancochrane Dec 21, 2023
fb49229
Satisfy pre-commit
jeancochrane Dec 21, 2023
3b75794
Remove num_iteration arg from predict() in comp calculation
jeancochrane Dec 21, 2023
c78da75
Make sure requirements.txt is copied into image before installing R d…
jeancochrane Dec 21, 2023
05cbea9
Install python3-venv in Dockerfile
jeancochrane Dec 21, 2023
46dad46
Pass n=20 to get_comps correctly in 04-interpret.R
jeancochrane Dec 29, 2023
dcb92be
Merge branch 'master' into jeancochrane/41-add-comparables-finding-ou…
jeancochrane Dec 29, 2023
e27581f
Temporarily slim down training set to test comp calculation
jeancochrane Dec 29, 2023
d40b1ab
Wrap get_comps() call in tryCatch in interpret pipeline stage for bet…
jeancochrane Jan 2, 2024
78644e8
Test raising an error from python/comps.py
jeancochrane Jan 2, 2024
c48de6c
Remove temporary error in python/comps.py
jeancochrane Jan 2, 2024
5beefd5
Swap arg order in _get_similarity_matrix to confirm numba error message
jeancochrane Jan 2, 2024
1415b36
Revert "Swap arg order in _get_similarity_matrix to confirm numba err…
jeancochrane Jan 2, 2024
08d654c
Raise error in interpret stage if get_comps fails
jeancochrane Jan 2, 2024
1639adb
Revert "Temporarily slim down training set to test comp calculation"
jeancochrane Jan 2, 2024
ea48b11
Try refactoring comps.py for less memory use
jeancochrane Jan 3, 2024
73225a1
Get comps working locally with less memory intensive algorithm
jeancochrane Jan 3, 2024
37b36d8
Use sales to generate comps
jeancochrane Jan 3, 2024
ef6ed8d
Instrument python/comps.py with logging and temporarily remove numba …
jeancochrane Jan 4, 2024
4994ba6
Instrument interpret comps stage with more logging and skip feature i…
jeancochrane Jan 4, 2024
0dd8bd1
Bump vcpu and memory in build-and-run-model to take full advantage of…
jeancochrane Jan 4, 2024
75d2636
Add some logging to try to determine whether record_evals are being s…
jeancochrane Jan 4, 2024
7be420f
Add extra logging to extract_weights function to debug empty weights …
jeancochrane Jan 5, 2024
ce08d2d
Pin lightsnip to jeancochrane/record-evals branch
jeancochrane Jan 5, 2024
6d82d5b
Remove debug logs from comps and tree weights extraction functions
jeancochrane Jan 5, 2024
a2d5bcc
njit _get_top_n_comps
jeancochrane Jan 5, 2024
30db55d
Revert "Remove debug logs from comps and tree weights extraction func…
jeancochrane Jan 5, 2024
a6318f1
Print record_evals length in train stage for debugging
jeancochrane Jan 5, 2024
9924e10
Add some more debug logging to train stage
jeancochrane Jan 5, 2024
b6d59ed
Merge branch 'master' into jeancochrane/41-add-comparables-finding-ou…
jeancochrane Jan 8, 2024
637458b
Switch to save_tree_error instead of valids arg in lightgbm model def…
jeancochrane Jan 9, 2024
687f5a6
Update lightsnip to latest working version
dfsnow Jan 9, 2024
50e3585
More fixes for comps
jeancochrane Jan 9, 2024
9fd09ab
Try removing parallelism from _get_top_n_comps
jeancochrane Jan 9, 2024
9b996d7
Enable parallelization for comps algorithm
jeancochrane Jan 10, 2024
6908621
Temporarily write comps inputs out to file for testing
jeancochrane Jan 10, 2024
bc0320c
Reduce vcpu/memory in build-and-run-model to see if it provisions sma…
jeancochrane Jan 10, 2024
66a5d78
Transpose weights in get_comps and add debug script
jeancochrane Jan 10, 2024
94794b2
Remove debugging utilities from comps pipeline ahead of final test
jeancochrane Jan 11, 2024
5fbe74d
Appease pre-commit
jeancochrane Jan 11, 2024
1ed0c64
Add back empty line in 04-interpret.R that got accidentally deleted
jeancochrane Jan 11, 2024
c9d46a1
Try jeancochrane/restrict-instance-types-in-build-and-run-batch-job b…
jeancochrane Jan 11, 2024
f0a54b5
Switch back to m4.10xlarge instance sizing in build-and-run-model
jeancochrane Jan 11, 2024
288e957
Add progress logging to comps.py
jeancochrane Jan 12, 2024
4e6999b
Switch back to main branch of build-and-run-batch-job
jeancochrane Jan 12, 2024
d817e2e
Switch to bare iteration rather than vector operations for producing …
jeancochrane Jan 16, 2024
c6f971f
Run comps against binned data to speed up python/comps.py
jeancochrane Jan 18, 2024
f0ed2e9
Log price ranges in python/comps.py
jeancochrane Jan 18, 2024
25b115e
Update comps pipeline to work with sales chunking
jeancochrane Jan 18, 2024
d068545
Qualify package for rownames_to_column in interpret pipeline stage
jeancochrane Jan 19, 2024
7628783
Skip comps bin when no observations are placed in that bin in python/…
jeancochrane Jan 19, 2024
74d4584
Small cleanup to python/comps.py
jeancochrane Jan 19, 2024
e58c062
Fix partitioning for comps pipeline
jeancochrane Jan 22, 2024
6d347e0
Merge branch 'master' into jeancochrane/41-add-comparables-finding-ou…
jeancochrane Jan 22, 2024
7eb2607
Fix typo in comps pipeline
jeancochrane Jan 23, 2024
2e2b7d1
Comps pipeline improvements following Dan's review
jeancochrane Jan 25, 2024
c65ecb8
Undo hack now that duplicate assessment data is fixed
jeancochrane Jan 25, 2024
3bc1f4a
Merge branch 'master' into jeancochrane/41-add-comparables-finding-ou…
jeancochrane Jan 25, 2024
3a12167
Rename meta_sale_price and pred_pin_final_fmv to predicted_value in p…
jeancochrane Jan 25, 2024
8b2a95e
Try casting leaf node tibbles to integer to resolve mysterious comps …
jeancochrane Jan 25, 2024
1386707
Use training_data to index into comp PINs rather than outdated assess…
jeancochrane Jan 26, 2024
bba719a
Appease pre-commit
jeancochrane Jan 26, 2024
1520d34
Update lightsnip and lightgbm
jeancochrane Jan 26, 2024
c9157db
R style fixes from code review
jeancochrane Jan 26, 2024
027723f
Only set save_tree_error when training the final model
jeancochrane Jan 26, 2024
584e8ec
Update arg name in extract_weights()
dfsnow Jan 27, 2024
40286fa
Use 32-bit dtypes for automagical reticulate conversion
dfsnow Jan 27, 2024
2a259f5
Merge branch 'master' into jeancochrane/41-add-comparables-finding-ou…
dfsnow Jan 27, 2024
b8519ed
Add explicit param deps for DVC
dfsnow Jan 28, 2024
3f4d5ff
Fix engine args typo for saving tree weights
dfsnow Jan 28, 2024
9453bcd
Add check for tree weights before comps run
dfsnow Jan 28, 2024
a28f09f
Change log message for comps
dfsnow Jan 28, 2024
a7b235a
Merge branch 'master' into jeancochrane/41-add-comparables-finding-ou…
dfsnow Jan 28, 2024
3 changes: 3 additions & 0 deletions .gitignore
@@ -26,3 +26,6 @@ cache/

# Ignore scratch documents
scratch*.*

# Python files
__pycache__
1 change: 1 addition & 0 deletions DESCRIPTION
@@ -24,6 +24,7 @@ Depends:
paws.analytics,
paws.application.integration,
recipes,
reticulate,
rlang,
rsample,
stringr,
8 changes: 4 additions & 4 deletions Dockerfile
@@ -14,9 +14,9 @@ ENV RENV_PATHS_CACHE /setup/cache
RUN apt-get update && \
apt-get install --no-install-recommends -y \
libcurl4-openssl-dev libssl-dev libxml2-dev libgit2-dev git \
libudunits2-dev python3-dev python3-pip libgdal-dev libgeos-dev \
libproj-dev libfontconfig1-dev libharfbuzz-dev libfribidi-dev pandoc \
curl gdebi-core && \
libudunits2-dev python3-dev python3-pip python3-venv libgdal-dev \
libgeos-dev libproj-dev libfontconfig1-dev libharfbuzz-dev \
libfribidi-dev pandoc curl gdebi-core && \
rm -rf /var/lib/apt/lists/*

# Install Quarto
@@ -28,7 +28,7 @@ RUN gdebi -n quarto-linux-amd64.deb
RUN pip install --no-cache-dir dvc[s3]

# Copy R bootstrap files into the image
COPY renv.lock .Rprofile DESCRIPTION ./
COPY renv.lock .Rprofile DESCRIPTION requirements.txt ./
COPY renv/profiles/reporting/renv.lock reporting-renv.lock
COPY renv/ renv/

19 changes: 19 additions & 0 deletions R/helpers.R
@@ -182,6 +182,25 @@ extract_num_iterations <- function(x) {
length(evals)
}

# Extract weights for model features based on feature importance. Assumes that
# the model was trained with the `valids` parameter set such that error metrics
# are saved for each tree on the model$record_evals attribute. The output
# weights are useful for computing comps using leaf node assignments
extract_weights <- function(model, mean_sale_price, metric = "rmse") {
# Index into the errors list, and un-list so it is a flat/1dim list
record_evals <- model$record_evals
errors <- unlist(record_evals$tree_errors[[metric]]$eval)
# Use the mean sale price as the initial error
errors <- c(mean_sale_price, errors)
diff_in_errors <- diff(errors, 1, 1)

# Take proportion of diff in errors over total diff in
# errors from all trees
weights <- diff_in_errors / sum(diff_in_errors)

return(weights)
}

# Given the result of a CV search, get the number of iterations from the
# result set with the best performing hyperparameters
select_iterations <- function(tune_results, metric, type = "mean") {
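The error-difference weighting in `extract_weights()` above is compact but easy to misread, so here is a minimal Python sketch of the same idea (illustrative only, not the repo's code — the pipeline does this in R): each tree's weight is its share of the total drop in error, with the mean sale price standing in as the error of the empty model.

```python
def extract_weights(tree_errors, mean_sale_price):
    """Weight each tree by its share of the total error reduction.

    tree_errors: error metric (e.g. RMSE) recorded after each tree.
    mean_sale_price: baseline "error" before any trees, as in the R helper.
    """
    errors = [mean_sale_price] + list(tree_errors)
    # diff_in_errors[i] is the error change contributed by tree i; dividing
    # by the total makes the weights sum to 1
    diff_in_errors = [b - a for a, b in zip(errors, errors[1:])]
    total = sum(diff_in_errors)
    return [d / total for d in diff_in_errors]

weights = extract_weights([200_000, 150_000, 130_000], mean_sale_price=300_000)
# error drops of 100k, 50k, and 20k out of 170k total: the weights sum to 1,
# and earlier trees (which reduce error most) get the largest weights
```

Note that the weights are well-behaved only when the error decreases monotonically; a tree that increases error would receive a negative weight under this scheme.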
4 changes: 4 additions & 0 deletions R/setup.R
@@ -72,6 +72,10 @@ shap_enable <- as.logical(Sys.getenv(
"SHAP_ENABLE_OVERRIDE",
unset = get(params_obj_name)$toggle$shap_enable
))
comp_enable <- as.logical(Sys.getenv(
"COMP_ENABLE_OVERRIDE",
unset = get(params_obj_name)$toggle$comp_enable
))
upload_enable <- as.logical(Sys.getenv(
"UPLOAD_ENABLE_OVERRIDE",
unset = get(params_obj_name)$toggle$upload_enable
3 changes: 3 additions & 0 deletions dvc.yaml
@@ -121,6 +121,8 @@ stages:
cache: false
- output/intermediate/timing/model_timing_interpret.parquet:
cache: false
- output/comp/model_comp.parquet:
cache: false

finalize:
cmd: Rscript pipeline/05-finalize.R
@@ -171,6 +173,7 @@ stages:
- output/performance/model_performance_assessment.parquet
- output/performance_quantile/model_performance_quantile_assessment.parquet
- output/shap/model_shap.parquet
- output/comp/model_comp.parquet
- output/feature_importance/model_feature_importance.parquet
- output/metadata/model_metadata.parquet
- output/timing/model_timing.parquet
Expand Down
3 changes: 2 additions & 1 deletion misc/file_dict.csv
@@ -23,8 +23,9 @@ output,performance_quantile_test_linear,3,evaluate,ccao-model-results-us-east-1,
output,performance_quantile_assessment,3,evaluate,ccao-model-results-us-east-1,output/performance_quantile/model_performance_quantile_assessment.parquet,performance_quantile/year={year}/stage=assessment/{run_id}.parquet,performance_quantile,geography [by class] by quantile,"year, run_id, stage, geography_type, geography_id, by_class, class, quantile",Performance metrics by quantile within class and geography,Assessment set uses the prior year sales to compare to the assessed value
output,shap,4,interpret,ccao-model-results-us-east-1,output/shap/model_shap.parquet,shap/,shap,card,"year, run_id, township_code, meta_pin, meta_card_num",SHAP values for each feature for each card in the assessment data,NOTE: Each run adds new partitions to S3 which must be added via a Glue crawler
output,feature_importance,4,interpret,ccao-model-results-us-east-1,output/feature_importance/model_feature_importance.parquet,feature_importance/year={year}/{run_id}.parquet,feature_importance,predictor,"year, run_id, model_predictor_all_name","Feature importance values (gain, cover, and frequency) for the run",
output,comp,4,interpret,ccao-model-results-us-east-1,output/comp/model_comp.parquet,comp/,comp,card,"year, run_id, meta_pin, meta_card_num",Comparables for each card (computed using leaf node assignments),
output,report_performance,5,finalize,ccao-model-results-us-east-1,reports/performance/performance.html,report/year={year}/report_type=performance/{run_id}.html,,model run,,Rendered Quarto doc with model performance statistics,
output,report_pin,5,finalize,ccao-model-results-us-east-1,reports/pin/,report/year={year}/report_type=pin/run_id={run_id}/,,model run,,Rendered Quarto doc for individual PINs,
output,metadata,5,finalize,ccao-model-results-us-east-1,output/metadata/model_metadata.parquet,metadata/year={year}/{run_id}.parquet,metadata,model run,"year, run_id","Information about each run, including parameters, run ID, git info, etc.",
intermediate,timing,,all,,output/intermediate/timing/,,,model stage,"year, msg",Parquet files for each stage containing the stage time elapsed,Converted into a one-row data frame in the finalize stage
output,timing,,all,ccao-model-results-us-east-1,output/timing/model_timing.parquet,timing/year={year}/{run_id}.parquet,timing,model run,"year, run_id",Finalized time elapsed for each stage of the run,"Each row represents one run, while columns represent the stages"
Empty file added output/comp/.gitkeep
Empty file.
3 changes: 3 additions & 0 deletions params.yaml
@@ -27,6 +27,9 @@ toggle:
# desirable to save time when testing many models
shap_enable: FALSE

# Should comps be calculated for this run in the interpret stage?
comp_enable: FALSE

# Upload all modeling artifacts and results to S3 in the upload stage. Set
# to FALSE if you are not a CCAO employee
upload_enable: TRUE
2 changes: 1 addition & 1 deletion pipeline/01-train.R
@@ -145,7 +145,7 @@ lgbm_model <- parsnip::boost_tree(
# using floor(log2(num_leaves)) + add_to_linked_depth. Useful since
# otherwise Bayesian opt spends time exploring irrelevant parameter space
link_max_depth = params$model$parameter$link_max_depth,

save_tree_error = comp_enable,
Review comment (Member):

issue (blocking): You'll actually need to move this out of the overall model spec and move it to the set_args() call that produces lgbm_model_final. That way it's shut off during CV but is on for final fitting.


### 4.1.2. Tuned Parameters ------------------------------------------------

131 changes: 129 additions & 2 deletions pipeline/04-interpret.R
@@ -24,8 +24,8 @@ message("Loading model fit and recipe")
lgbm_final_full_fit <- lightsnip::lgbm_load(paths$output$workflow_fit$local)
lgbm_final_full_recipe <- readRDS(paths$output$workflow_recipe$local)

if (shap_enable) {
message("Loading assessment data for SHAP calculation")
if (shap_enable || comp_enable) {
message("Loading assessment data for SHAP and comp calculation")

# Load the input data used for assessment. This is the universe of CARDs (not
# PINs) that need values. Will use the the trained model to calc SHAP values
Expand All @@ -39,6 +39,13 @@ if (shap_enable) {
)
}

if (comp_enable) {
message("Loading predicted values for comp calculation")

assessment_card <- read_parquet(paths$output$assessment_card$local) %>%
as_tibble()
}




@@ -105,6 +112,126 @@ lightgbm::lgb.importance(lgbm_final_full_fit$fit) %>%
rename_with(~ paste0(.x, "_value"), gain:frequency) %>%
write_parquet(paths$output$feature_importance$local)




#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 4. Calculate comps -----------------------------------------------------------
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

if (comp_enable) {
message("Calculating comps")

# Calculate the leaf node assignments for every predicted value.
# Due to integer overflow problems with leaf node assignment, we need to
# chunk our data such that they are strictly less than the limit of 1073742
# rows. More detail here: https://github.com/microsoft/LightGBM/issues/1884
chunk_size <- 500000
chunks <- split(
assessment_data_prepped,
ceiling(seq_along(assessment_data_prepped[[1]]) / chunk_size)
)
chunked_leaf_nodes <- chunks %>%
map(\(chunk) {
predict(
object = lgbm_final_full_fit$fit,
newdata = as.matrix(chunk),
type = "leaf",
)
})
  # Prefer do.call(rbind, ...) over bind_rows() because predict() returns
  # bare matrices rather than data frames, and bind_rows() cannot combine
  # matrices
leaf_nodes <- do.call(rbind, chunked_leaf_nodes) %>% as_tibble()
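The `chunk_size` logic above works around the LightGBM leaf-prediction overflow linked in the comment. A hedged Python sketch of the same chunk-predict-concatenate pattern follows; `predict_leaf` is a stand-in for the real `Booster.predict(..., pred_leaf=True)` call, not the repo's code.

```python
CHUNK_SIZE = 500_000  # keep each call well under the ~1,073,742-row limit

def predict_leaf(rows):
    # Stand-in for a LightGBM leaf-index prediction: one leaf id per row.
    return [row % 8 for row in rows]

def chunked_leaf_nodes(rows, chunk_size=CHUNK_SIZE):
    """Predict leaf assignments chunk by chunk, then concatenate in order."""
    out = []
    for start in range(0, len(rows), chunk_size):
        out.extend(predict_leaf(rows[start:start + chunk_size]))
    return out

# Chunking must be order-preserving so results line up row-for-row with the
# input data, mirroring the do.call(rbind, ...) recombination in the R code
leaves = chunked_leaf_nodes(list(range(10)), chunk_size=4)  # 3 calls: 4 + 4 + 2 rows
```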

# Calculate weights representing feature importance, so that we can weight
# leaf node assignments based on the most important features.
# To do this, we need the training data so that we can compute the mean sale
# price and use it as the base model error
message("Extracting weights from training data")
training_data <- read_parquet(paths$input$training$local) %>%
filter(!ind_pin_is_multicard, !sv_is_outlier) %>%
as_tibble()

tree_weights <- extract_weights(
model = lgbm_final_full_fit$fit,
mean_sale_price = mean(training_data[["meta_sale_price"]]),
metric = params$model$objective
)

# Get predicted values and leaf node assignments for the training data
training_data_prepped <- recipes::bake(
object = lgbm_final_full_recipe,
new_data = training_data,
all_predictors()
)
training_leaf_nodes <- predict(
object = lgbm_final_full_fit$fit,
newdata = as.matrix(training_data_prepped),
type = "leaf"
) %>%
as_tibble()
training_leaf_nodes$predicted_value <- predict(
object = lgbm_final_full_fit$fit,
newdata = as.matrix(training_data_prepped)
) %>%
# Round predicted values down for binning
floor()

# Get predicted values for the assessment set, which we already have in
# the assessment card set
leaf_nodes$predicted_value <- assessment_data %>%
left_join(assessment_card, by = c("meta_pin", "meta_card_num")) %>%
# Round predicted values down for binning
mutate(pred_card_initial_fmv = floor(pred_card_initial_fmv)) %>%
dplyr::pull("pred_card_initial_fmv")

# Make sure that the leaf node tibbles are all integers, which is what
# the comps algorithm expects
leaf_nodes <- leaf_nodes %>% mutate_all(as.integer)
training_leaf_nodes <- training_leaf_nodes %>% mutate_all(as.integer)
Review comment (Contributor, Author):

This block ended up fixing the float-index slicing error that I was debugging yesterday. It seems like somehow the transformations we tweaked above ended up causing reticulate to interpret all numeric tibble values as floats rather than integers; I'm not 100% sure which transformation caused that change in behavior, but explicitly casting the tibbles to integers here resolves the issue.


# Do the comps calculation in Python because the code is simpler and faster
message("Calling out to python/comps.py to perform comps calculation")
comps_module <- import("python.comps")
tryCatch(
{
comps <- comps_module$get_comps(
leaf_nodes, training_leaf_nodes, tree_weights,
n = as.integer(20)
)
},
error = function(e) {
# Log the full Python traceback in case of an error
print(py_last_error())
stop("Encountered error in python/comps.py")
}
)
# Correct for the fact that Python is 0-indexed by incrementing the
# comp indexes by 1
comps[[1]] <- comps[[1]] + 1

# Translate comp indexes to PINs
comps[[1]] <- comps[[1]] %>%
mutate_all(\(idx_row) {
training_data[idx_row, ]$meta_pin
}) %>%
cbind(
pin = assessment_data$meta_pin,
card = assessment_data$meta_card_num
) %>%
relocate(pin, card) %>%
rename_with(\(colname) gsub("comp_idx_", "comp_pin_", colname))

# Combine the comp indexes and scores into one dataframe and write to a file
cbind(comps[[1]], comps[[2]]) %>%
write_parquet(paths$output$comp$local)
} else {
# If comp creation is disabled, we still need to write an empty stub file
# so DVC doesn't complain
arrow::write_parquet(data.frame(), paths$output$comp$local)
}

# End the stage timer and write the time elapsed to a temporary file
tictoc::toc(log = TRUE)
bind_rows(tictoc::tic.log(format = FALSE)) %>%
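`python/comps.py` itself is not part of this diff, but the idea the interpret stage relies on can be sketched: two properties are similar when they land in the same leaf of many trees, with each tree's vote scaled by the weights from `extract_weights()`. A minimal, hedged Python illustration (function and variable names are my own, not the module's API):

```python
def top_n_comps(obs_leaves, train_leaves, weights, n):
    """Score every training row by the weighted count of trees in which it
    shares a leaf with the observation, returning (score, index) pairs for
    the n best. Indices are 0-based, which is why the R caller adds 1."""
    scores = []
    for idx, cand in enumerate(train_leaves):
        # Sum the weights of the trees where both rows share a leaf
        score = sum(w for a, b, w in zip(obs_leaves, cand, weights) if a == b)
        scores.append((score, idx))
    # Highest score first; break ties by training-row order
    scores.sort(key=lambda t: (-t[0], t[1]))
    return scores[:n]

train_leaves = [[1, 4, 2], [1, 5, 2], [0, 4, 3]]  # leaf ids per row, 3 trees
weights = [0.5, 0.25, 0.25]                       # from the tree-error weighting
comps = top_n_comps([1, 4, 2], train_leaves, weights, n=2)
# row 0 matches in all three trees (score 1.0); row 1 matches trees 0 and 2 (0.75)
```

As the R code above shows, the stage then shifts the returned indices to R's 1-based convention and joins them back to `training_data` to recover the comparable PINs.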
1 change: 1 addition & 0 deletions pipeline/05-finalize.R
@@ -89,6 +89,7 @@ metadata <- tibble::tibble(
ratio_study_near_column = params$ratio_study$near_column,
ratio_study_num_quantile = list(params$ratio_study$num_quantile),
shap_enable = shap_enable,
comp_enable = comp_enable,
cv_enable = cv_enable,
cv_num_folds = params$cv$num_folds,
cv_fold_overlap = params$cv$fold_overlap,
14 changes: 14 additions & 0 deletions pipeline/06-upload.R
@@ -204,6 +204,20 @@ if (upload_enable) {
relocate(run_id) %>%
write_parquet(paths$output$feature_importance$s3)

# Upload comps
if (comp_enable) {
message("Uploading comps")
read_parquet(paths$output$comp$local) %>%
mutate(run_id = run_id, year = params$assessment$working_year) %>%
group_by(year, run_id) %>%
arrow::write_dataset(
path = paths$output$comp$s3,
format = "parquet",
hive_style = TRUE,
compression = "snappy"
)
}


# 2.5. Finalize --------------------------------------------------------------
message("Uploading run metadata, timings, and reports")
Empty file added python/__init__.py
Empty file.