--- title: "Claim Counts Prediction Using Individual Data" author: "Gabriele Pittarello" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Claim Counts Prediction Using Individual Data} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` ```{r eval=FALSE, include=TRUE} library(ReSurv) library(reticulate) use_virtualenv("pyresurv") library(ggplot2) ``` In this vignette we show a pipeline for predicting individual claim counts using the `ReSurv` package. This vignette was created for the *2024 Reserving Call Paper Program on Technology and the Reserving Actuary* of the Casualty Actuarial Society (CAS) and the CAS Reserves Working Group. We would like to thank the team of reviewers that provided feedback that improved our package. # Installation To install the developer version of the `ReSurv` package. ```{r eval=FALSE, include=TRUE} library(devtools) devtools::install_github("edhofman/resurv") library(ReSurv) packageVersion("ReSurv") ``` To handle the Python dependencies for *the first package installation*, we suggest to create a dedicated virtual environment. The remaining part of this Section can be disregard from users that are not interested in using our models based on Neural Networks. ```{r eval=FALSE, include=TRUE} ReSurv::install_pyresurv() ``` The default name of the virtual environment is `"pyresurv"`. We then suggest to refresh the R session and to import the `ReSurv` package in R using ```{r eval=FALSE, include=TRUE} library(ReSurv) reticulate::use_virtualenv("pyresurv") ``` ## Managing Multiple Package Dependencies For the case of multiple packages using isolated-package-environments we suggest the following procedure. Below an example for `pysparklyr`. ```{r eval=FALSE, include=TRUE} envname <- "./venv" ReSurv::install_pyresurv(envname = envname) pysparklyr::install_pyspark(envname = envname) ``` # Data Simulation We simulate a synthetic data set from scenario Alpha. ```{r eval=FALSE, include=TRUE} input_data <- data_generator(random_seed = 7, scenario = "alpha", time_unit = 1 / 360, years = 4, period_exposure = 200) ``` We inspect the data set. ```{r eval=FALSE, include=TRUE} str(input_data) ``` # Data Pre-Processing The data is pre-processed using the `IndividualDataPP` function. ```{r eval=FALSE, include=TRUE} individual_data <- IndividualDataPP(data = input_data, categorical_features = "claim_type", continuous_features = "AP", accident_period = "AP", calendar_period = "RP", input_time_granularity = "days", output_time_granularity = "quarters", years = 4) ``` # Selection of the hyper-parameters This Sections first shows the usage of the `ReSurvCV` function for cross-validation. Secondly, we show how to combine the `ReSurvCV` function into the `ParBayesianOptimization`package routine for performing the Bayesian Optimization of Hyperparameters. ## XGB: K-Fold cross-validation (CV) Example of K-Fold cross-validation (CV) for the XGB model. ```{r eval=FALSE, include=TRUE} resurv_cv_xgboost <- ReSurvCV(IndividualDataPP = individual_data, model = "XGB", hparameters_grid = list(booster = "gbtree", eta = c(.001), max_depth = c(3), subsample = c(1), alpha = c(0), lambda = c(0), min_child_weight = c(.5)), print_every_n = 1L, nrounds = 1, verbose = FALSE, verbose.cv = TRUE, early_stopping_rounds = 1, folds = 5, parallel = TRUE, ncores = 2, random_seed = 1) ``` ## NN: K-Fold cross-validation Example of K-Fold cross-validation (CV) for the NN model. ```{r eval=FALSE, include=TRUE} resurv_cv_nn <- ReSurvCV(IndividualDataPP = individual_data, model = "NN", hparameters_grid = list(num_layers = c(1, 2), num_nodes = c(2, 4), optim = "Adam", activation = "ReLU", lr = .5, xi = .5, eps = .5, tie = "Efron", batch_size = as.integer(5000), early_stopping = "TRUE", patience = 20), epochs = as.integer(300), num_workers = 0, verbose = FALSE, verbose.cv = TRUE, folds = 3, parallel = FALSE, random_seed = as.integer(Sys.time())) ``` ## ReSurv and Bayesian Parameters Optimisation ### XGB: Bayesian Parameters Optimisation Example of Bayesian Optimization of Hyperparameters for the XGB model. ```{r eval=FALSE, include=TRUE} bounds <- list(eta = c(0, 1), max_depth = c(1L, 25L), min_child_weight = c(0, 50), subsample = c(0.51, 1), lambda = c(0, 15), alpha = c(0, 15)) obj_func <- function(eta, max_depth, min_child_weight, subsample, lambda, alpha) { xgbcv <- ReSurvCV( IndividualDataPP = individual_data, model = "XGB", hparameters_grid = list( booster = "gbtree", eta = eta, max_depth = max_depth, subsample = subsample, alpha = lambda, lambda = alpha, min_child_weight = min_child_weight ), print_every_n = 1L, nrounds = 500, verbose = FALSE, verbose.cv = TRUE, early_stopping_rounds = 30, folds = 3, parallel = FALSE, random_seed = as.integer(Sys.time()) ) lst <- list(Score = -xgbcv$out.cv.best.oos$test.lkh, train.lkh = xgbcv$out.cv.best.oos$train.lkh) return(lst) } bayes_out <- bayesOpt(FUN = obj_func, bounds = bounds, initPoints = 50, iters.n = 1000, iters.k = 50, otherHalting = list(timeLimit = 18000)) ``` ### NN: Bayesian Parameters Optimisation Example of Bayesian Optimization of Hyperparameters for the XGB model. ```{r eval=FALSE, include=TRUE} bounds <- list(num_layers = c(2L, 10L), num_nodes = c(2L, 10L), optim = c(1L, 2L), activation = c(1L, 2L), lr = c(0.005, 0.5), xi = c(0, 0.5), eps = c(0, 0.5)) obj_func <- function(num_layers, num_nodes, optim, activation, lr, xi, eps) { optim <- switch(optim, "Adam", "SGD") activation <- switch(activation, "LeakyReLU", "SELU") batch_size <- as.integer(5000L) number_layers <- as.integer(num_layers) num_nodes <- as.integer(num_nodes) deepsurv_cv <- ReSurvCV(IndividualData = individual_data, model = "NN", hparameters_grid = list(num_layers = number_layers, num_nodes = num_nodes, optim = optim, activation = activation, lr = lr, xi = xi, eps = eps, tie = "Efron", batch_size = batch_size, early_stopping = "TRUE", patience = 20), epochs = as.integer(300), num_workers = 0, verbose = FALSE, verbose.cv = TRUE, folds = 3, parallel = FALSE, random_seed = as.integer(Sys.time())) lst <- list(Score = -deepsurv_cv$out.cv.best.oos$test.lkh, train.lkh = deepsurv_cv$out.cv.best.oos$train.lkh) return(lst) } bayes_out <- bayesOpt(FUN = obj_func, bounds = bounds, initPoints = 50, iters.n = 1000, iters.k = 50, otherHalting = list(timeLimit = 18000)) ``` # Estimation In this Section we show the usage of the `ReSurv` method for the estimation of our models parameters. The hyperparameters for NN and XGB were derived with the Bayesian Optimisation procedure. ## COX ```{r eval=FALSE, include=TRUE} resurv_fit_cox <- ReSurv(individual_data, hazard_model = "COX") ``` ## XGB ```{r eval=FALSE, include=TRUE} hparameters_xgb <- list( params = list( booster = "gbtree", eta = 0.9611239, subsample = 0.62851, alpha = 5.836211, lambda = 15, min_child_weight = 29.18158, max_depth = 1 ), print_every_n = 0, nrounds = 3000, verbose = FALSE, early_stopping_rounds = 500 ) resurv_fit_xgb <- ReSurv(individual_data, hazard_model = "XGB", hparameters = hparameters_xgb) ``` ## NN ```{r eval=FALSE, include=TRUE} hparameters_nn = list(num_layers= 2, early_stopping= TRUE, patience=350, verbose=FALSE, network_structure=NULL, num_nodes= 10, activation ="LeakyReLU", optim ="SGD", lr =0.02226655, xi=0.4678993, epsilon= 0, batch_size= 5000L, epochs= 5500L, num_workers= 0, tie="Efron" ) resurv_fit_nn <- ReSurv(individual_data, hazard_model = "NN", hparameters = hparameters_nn) ``` # Prediction In this Section we show how to use our models for predictions with the `predict` method and for different output time granularities (quarterly, yearly and monthly). The output can be displayed using the methods `summary` and `print`. ```{r eval=FALSE, include=TRUE} resurv_fit_predict_q <- predict(resurv_fit_cox) individual_data_y <- IndividualDataPP(input_data, id = "claim_number", continuous_features = "AP_i", categorical_features = "claim_type", accident_period = "AP", calendar_period = "RP", input_time_granularity = "days", output_time_granularity = "years", years = 4, continuous_features_spline = NULL, calendar_period_extrapolation = FALSE) resurv_fit_predict_y <- predict(resurv_fit_cox, newdata = individual_data_y, grouping_method = "probability") individual_data_m <- IndividualDataPP(input_data, id = "claim_number", continuous_features = "AP_i", categorical_features = "claim_type", accident_period = "AP", calendar_period = "RP", input_time_granularity = "days", output_time_granularity = "months", years = 4, continuous_features_spline = NULL, calendar_period_extrapolation = FALSE) resurv_fit_predict_m <- predict(resurv_fit_cox, newdata = individual_data_m, grouping_method = "probability") model_s <- summary(resurv_fit_predict_y) print(model_s) ``` Similar functions for the other models ```{r eval=FALSE, include=TRUE} resurv_predict_xgb <- predict(resurv_fit_xgb) resurv_predict_nn <- predict(resurv_fit_nn) ``` # Plot the predicted development factors In this Section we show an example of development factors plot for the COX model. ```{r eval=FALSE, include=TRUE} resurv_fit_predict_q$long_triangle_format_out$output_granularity %>% filter(AP_o==15 & claim_type == 1) %>% filter(row_number()==1) %>% select(group_o) plot(resurv_fit_predict_q, granularity = "output", title_par = "COX: Accident Quarter 15 Claim Type 1", group_code = 30) ``` # CRPS In this Section we show the CRPS calculation using the `survival_crps` method. ```{r eval=FALSE, include=TRUE} crps <- survival_crps(resurv_fit_cox) m_crps <- mean(crps$crps) m_crps ``` # Plot the development factors We first extract the output group code for an arbitrary group. ```{r eval=FALSE, include=TRUE} resurv_fit_predict_q$long_triangle_format_out$output_granularity %>% filter(AP_o==15 & claim_type == 1) %>% filter(row_number()==1) %>% select(group_o) ``` We use the group code to plot the feature dependent development factors. ```{r eval=FALSE, include=TRUE} plot(resurv_fit_predict_q, granularity = "output", title_par = "COX: Accident Quarter 15 Claim Type 1", group_code = 30) ``` # Appendix ## Code to replicate Figure 1 ```{r eval=FALSE, include=TRUE} p1 <- input_data %>% as.data.frame() %>% mutate(claim_type = as.factor(claim_type)) %>% ggplot(aes(x = RT - AT, color = claim_type)) + stat_ecdf(size = 1) + labs(title = "", x = "Notification delay (in days)", y = "ECDF") + xlim(0.01, 1500) + scale_color_manual( values = c("royalblue", "#a71429"), labels = c("Claim type 0", "Claim type 1") ) + scale_linetype_manual(values = c(1, 3), labels = c("Claim type 0", "Claim type 1")) + guides( color = guide_legend( title = "Claim type", override.aes = list( color = c("royalblue", "#a71429"), linewidth = 2 ) ), linetype = guide_legend( title = "Claim type", override.aes = list(linetype = c(1, 3), linewidth = 0.7) ) ) + theme_bw() + theme( axis.text = element_text(size = 20), axis.title.y = element_text(size = 20), axis.title.x = element_text(size = 20), legend.text = element_text(size = 20) ) p1 p2 <- input_data %>% as.data.frame() %>% mutate(claim_type = as.factor(claim_type)) %>% ggplot(aes(x = claim_type, fill = claim_type)) + geom_bar() + scale_fill_manual( values = c("royalblue", "#a71429"), labels = c("Claim type 0", "Claim type 1") ) + guides(fill = guide_legend(title = "Claim type")) + theme_bw() + labs(title = "", x = "Claim Type", y = "") + theme( axis.text = element_text(size = 20), axis.title.y = element_text(size = 20), axis.title.x = element_text(size = 20), legend.text = element_text(size = 20) ) p2 ``` ## Code for plotting feature-dependent development factors Chain-Ladder case. ```{r eval=FALSE, include=TRUE} clmodel <- resurv_fit_cox$IndividualDataPP$training.data %>% mutate(DP_o = 16 - DP_rev_o + 1) %>% group_by(AP_o, DP_o) %>% summarize(I = sum(I), .groups = "drop") %>% group_by(AP_o) %>% arrange(DP_o) %>% mutate(I_cum = cumsum(I), I_cum_lag = lag(I_cum, default = 0)) %>% ungroup() %>% group_by(DP_o) %>% reframe(df_o = sum(I_cum * ( AP_o <= max(resurv_fit_cox$IndividualDataPP$training.data$AP_o) - DP_o + 1 )) / sum(I_cum_lag * ( AP_o <= max(resurv_fit_cox$IndividualDataPP$training.data$AP_o) - DP_o + 1 )), I = sum(I * ( AP_o <= max(resurv_fit_cox$IndividualDataPP$training.data$AP_o) - DP_o ))) %>% mutate(DP_o_join = DP_o - 1) %>% as.data.frame() clmodel %>% filter(DP_o > 1) %>% ggplot(aes(x = DP_o, y = df_o)) + geom_line(linewidth = 2.5, color = "#454555") + labs(title = "Chain ladder", x = "Development quarter", y = "Development factor") + ylim(1, 3. + .01) + theme_bw(base_size = rel(5)) + theme(plot.title = element_text(size = 20)) ## clmodel_months <- individual_data_m$training.data %>% mutate(DP_o = 48 - DP_rev_o + 1) %>% group_by(AP_o, DP_o) %>% summarize(I = sum(I), .groups = "drop") %>% group_by(AP_o) %>% arrange(DP_o) %>% mutate(I_cum = cumsum(I), I_cum_lag = lag(I_cum, default = 0)) %>% ungroup() %>% group_by(DP_o) %>% reframe(df_o = sum(I_cum * ( AP_o <= max(individual_data_m$training.data$AP_o) - DP_o + 1 )) / sum(I_cum_lag * ( AP_o <= max(individual_data_m$training.data$AP_o) - DP_o + 1 )), I = sum(I * ( AP_o <= max(individual_data_m$training.data$AP_o) - DP_o ))) %>% mutate(DP_o_join = DP_o - 1) %>% as.data.frame() ticks_at <- seq(1, 48, 4) labels_as <- as.character(ticks_at) clmodel_months %>% filter(DP_o > 1) %>% ggplot(aes(x = DP_o, y = df_o)) + geom_line(linewidth = 2.5, color = "#454555") + labs(title = "Chain ladder", x = "Development month", y = "Development factor") + ylim(1, 2.5 + .01) + scale_x_continuous(breaks = ticks_at, labels = labels_as) + theme_bw(base_size = rel(5)) + theme(plot.title = element_text(size = 20)) ## clmodel_years <- individual_data_y$training.data %>% mutate(DP_o = 4 - DP_rev_o + 1) %>% group_by(AP_o, DP_o) %>% summarize(I = sum(I), .groups = "drop") %>% group_by(AP_o) %>% arrange(DP_o) %>% mutate(I_cum = cumsum(I), I_cum_lag = lag(I_cum, default = 0)) %>% ungroup() %>% group_by(DP_o) %>% reframe(df_o = sum(I_cum * ( AP_o <= max(individual_data_m$training.data$AP_o) - DP_o + 1 )) / sum(I_cum_lag * ( AP_o <= max(individual_data_m$training.data$AP_o) - DP_o + 1 )), I = sum(I * ( AP_o <= max(individual_data_m$training.data$AP_o) - DP_o ))) %>% mutate(DP_o_join = DP_o - 1) %>% as.data.frame() ticks_at <- seq(1, 4, 1) labels_as <- as.character(ticks_at) clmodel_years %>% filter(DP_o > 1) %>% ggplot(aes(x = DP_o, y = df_o)) + geom_line(linewidth = 2.5, color = "#454555") + labs(title = "Chain ladder", x = "Development year", y = "Development factor") + ylim(1, 2.5 + .01) + scale_x_continuous(breaks = ticks_at, labels = labels_as) + theme_bw(base_size = rel(5)) + theme(plot.title = element_text(size = 20)) ``` COX, quarterly output. ```{r eval=FALSE, include=TRUE} ct <- 0 ap <- 12 resurv_fit_predict_q$long_triangle_format_out$output_granularity %>% filter(AP_o == ap & claim_type == ct) %>% filter(row_number() == 1) %>% select(group_o) plot( resurv_fit_predict_q, granularity = "output", title_par = "COX: Accident Quarter 12 Claim Type 0", group_code = 23 ) ct <- 0 ap <- 36 resurv_fit_predict_m$long_triangle_format_out$output_granularity %>% filter(AP_o == ap & claim_type == ct) %>% filter(row_number() == 1) %>% select(group_o) plot( resurv_fit_predict_m, granularity = "output", color_par = "#a71429", title_par = "COX: Accident Month 36 Claim Type 0", group_code = 71 ) ct <- 1 ap <- 7 resurv_fit_predict_m$long_triangle_format_out$output_granularity %>% filter(AP_o == ap & claim_type == ct) %>% filter(row_number() == 1) %>% select(group_o) plot( resurv_fit_predict_m, granularity = "output", color_par = "#a71429", title_par = "COX: Accident Month 7 Claim Type 1", group_code = 14 ) ct <- 0 ap <- 2 resurv_fit_predict_y$long_triangle_format_out$output_granularity %>% filter(AP_o == ap & claim_type == ct) %>% filter(row_number() == 1) %>% select(group_o) plot( resurv_fit_predict_y, granularity = "output", color_par = "#FF6A7A", title_par = "COX: Accident Year 2 Claim Type 0", group_code = 3 ) ct <- 1 ap <- 3 resurv_fit_predict_y$long_triangle_format_out$output_granularity %>% filter(AP_o == ap & claim_type == ct) %>% filter(row_number() == 1) %>% select(group_o) plot( resurv_fit_predict_y, granularity = "output", color_par = "#FF6A7A", title_par = "COX: Accident Year 3 Claim Type 1", group_code = 6 ) ``` ## Code for computing ARE TOT and ARE CAL The code below can be used to compute the performance metrics in Table 6 of the manuscript. ```{r eval=FALSE, include=TRUE} conversion_factor <- individual_data$conversion_factor max_dp_i <- 1440 true_output <- resurv_fit_cox$IndividualDataPP$full.data %>% mutate( DP_rev_o = floor(max_dp_i * conversion_factor) - ceiling( DP_i * conversion_factor + ((AP_i - 1) %% ( 1 / conversion_factor )) * conversion_factor) + 1, AP_o = ceiling(AP_i * conversion_factor), TR_o = AP_o - 1 ) %>% filter(DP_rev_o <= TR_o) %>% group_by(claim_type, AP_o, DP_rev_o) %>% mutate(claim_type = as.character(claim_type)) %>% summarize(I = sum(I), .groups = "drop") %>% filter(DP_rev_o > 0) out_list <- resurv_fit_predict_q$long_triangle_format_out out <- out_list$output_granularity #Total output score_total <- out[, c("claim_type", "AP_o", "DP_o", "expected_counts")] %>% mutate(DP_rev_o = 16 - DP_o + 1) %>% inner_join(true_output, by = c("claim_type", "AP_o", "DP_rev_o")) %>% mutate(ave = I - expected_counts, abs_ave = abs(ave)) %>% # from here it is reformulated for the are tot ungroup() %>% group_by(AP_o, DP_rev_o) %>% reframe(abs_ave = abs(sum(ave)), I = sum(I)) are_tot <- sum(score_total$abs_ave) / sum(score_total$I) dfs_output <- out %>% mutate(DP_rev_o = 16 - DP_o + 1) %>% select(AP_o, claim_type, DP_rev_o, f_o) %>% mutate(DP_rev_o = DP_rev_o) %>% distinct() #Cashflow on output scale.Etc quarterly cashflow development score_diagonal <- resurv_fit_cox$IndividualDataPP$full.data %>% mutate( DP_rev_o = floor(max_dp_i * conversion_factor) - ceiling( DP_i * conversion_factor + ((AP_i - 1) %% ( 1 / conversion_factor )) * conversion_factor) + 1, AP_o = ceiling(AP_i * conversion_factor) ) %>% group_by(claim_type, AP_o, DP_rev_o) %>% mutate(claim_type = as.character(claim_type)) %>% summarize(I = sum(I), .groups = "drop") %>% group_by(claim_type, AP_o) %>% arrange(desc(DP_rev_o)) %>% mutate(I_cum = cumsum(I)) %>% mutate(I_cum_lag = lag(I_cum, default = 0)) %>% left_join(dfs_output, by = c("AP_o", "claim_type", "DP_rev_o")) %>% mutate(I_cum_hat = I_cum_lag * f_o, RP_o = max(DP_rev_o) - DP_rev_o + AP_o) %>% inner_join(true_output[, c("AP_o", "DP_rev_o")] %>% distinct() , by = c("AP_o", "DP_rev_o")) %>% group_by(AP_o, DP_rev_o) %>% reframe(abs_ave2_diag = abs(sum(I_cum_hat) - sum(I_cum)), I = sum(I)) are_cal_q <- sum(score_diagonal$abs_ave2_diag) / sum(score_diagonal$I) # scoring XGB ---- out_xgb <- resurv_predict_xgb$long_triangle_format_out$output_granularity score_total_xgb <- out_xgb[, c("claim_type", "AP_o", "DP_o", "expected_counts")] %>% mutate(DP_rev_o = 16 - DP_o + 1) %>% inner_join(true_output, by = c("claim_type", "AP_o", "DP_rev_o")) %>% mutate(ave = I - expected_counts, abs_ave = abs(ave)) %>% # from here it is reformulated for the are tot ungroup() %>% group_by(AP_o, DP_rev_o) %>% reframe(abs_ave = abs(sum(ave)), I = sum(I)) are_tot_xgb <- sum(score_total_xgb$abs_ave) / sum(score_total$I) dfs_output_xgb <- out_xgb %>% mutate(DP_rev_o = 16 - DP_o + 1) %>% select(AP_o, claim_type, DP_rev_o, f_o) %>% mutate(DP_rev_o = DP_rev_o) %>% distinct() #Cashflow on output scale.Etc quarterly cashflow development score_diagonal_xgb <- resurv_fit_cox$IndividualDataPP$full.data %>% mutate( DP_rev_o = floor(max_dp_i * conversion_factor) - ceiling( DP_i * conversion_factor + ((AP_i - 1) %% ( 1 / conversion_factor )) * conversion_factor) + 1, AP_o = ceiling(AP_i * conversion_factor) ) %>% group_by(claim_type, AP_o, DP_rev_o) %>% mutate(claim_type = as.character(claim_type)) %>% summarize(I = sum(I), .groups = "drop") %>% group_by(claim_type, AP_o) %>% arrange(desc(DP_rev_o)) %>% mutate(I_cum = cumsum(I)) %>% mutate(I_cum_lag = lag(I_cum, default = 0)) %>% left_join(dfs_output_xgb, by = c("AP_o", "claim_type", "DP_rev_o")) %>% mutate(I_cum_hat = I_cum_lag * f_o, RP_o = max(DP_rev_o) - DP_rev_o + AP_o) %>% inner_join(true_output[, c("AP_o", "DP_rev_o")] %>% distinct() , by = c("AP_o", "DP_rev_o")) %>% group_by(AP_o, DP_rev_o) %>% reframe(abs_ave2_diag = abs(sum(I_cum_hat) - sum(I_cum)), I = sum(I)) are_cal_q_xgb <- sum(score_diagonal_xgb$abs_ave2_diag) / sum(score_diagonal$I) # scoring NN ---- out_nn <- resurv_predict_nn$long_triangle_format_out$output_granularity score_total_nn <- out_nn[, c("claim_type", "AP_o", "DP_o", "expected_counts")] %>% mutate(DP_rev_o = 16 - DP_o + 1) %>% inner_join(true_output, by = c("claim_type", "AP_o", "DP_rev_o")) %>% mutate(ave = I - expected_counts, abs_ave = abs(ave)) %>% # from here it is reformulated for the are tot ungroup() %>% group_by(AP_o, DP_rev_o) %>% reframe(abs_ave = abs(sum(ave)), I = sum(I)) are_tot_nn <- sum(score_total_nn$abs_ave) / sum(score_total$I) dfs_output_nn <- out_nn %>% mutate(DP_rev_o = 16 - DP_o + 1) %>% select(AP_o, claim_type, DP_rev_o, f_o) %>% mutate(DP_rev_o = DP_rev_o) %>% distinct() #Cashflow on output scale.Etc quarterly cashflow development score_diagonal_nn <- resurv_fit_cox$IndividualDataPP$full.data %>% mutate( DP_rev_o = floor(max_dp_i * conversion_factor) - ceiling( DP_i * conversion_factor + ((AP_i - 1) %% ( 1 / conversion_factor )) * conversion_factor) + 1, AP_o = ceiling(AP_i * conversion_factor) ) %>% group_by(claim_type, AP_o, DP_rev_o) %>% mutate(claim_type = as.character(claim_type)) %>% summarize(I = sum(I), .groups = "drop") %>% group_by(claim_type, AP_o) %>% arrange(desc(DP_rev_o)) %>% mutate(I_cum = cumsum(I)) %>% mutate(I_cum_lag = lag(I_cum, default = 0)) %>% left_join(dfs_output_nn, by = c("AP_o", "claim_type", "DP_rev_o")) %>% mutate(I_cum_hat = I_cum_lag * f_o, RP_o = max(DP_rev_o) - DP_rev_o + AP_o) %>% inner_join(true_output[, c("AP_o", "DP_rev_o")] %>% distinct() , by = c("AP_o", "DP_rev_o")) %>% group_by(AP_o, DP_rev_o) %>% reframe(abs_ave2_diag = abs(sum(I_cum_hat) - sum(I_cum)), I = sum(I)) are_cal_q_nn <- sum(score_diagonal_nn$abs_ave2_diag) / sum(score_diagonal$I) # Scoring Chain-Ladder ---- true_output_cl <- individual_data$full.data %>% mutate( DP_rev_o = floor(max_dp_i * conversion_factor) - ceiling( DP_i * conversion_factor + ((AP_i - 1) %% ( 1 / conversion_factor )) * conversion_factor) + 1, AP_o = ceiling(AP_i * conversion_factor) ) %>% filter(DP_rev_o <= TR_o) %>% mutate(DP_o = max(individual_data$training.data$DP_rev_o) - DP_rev_o + 1) %>% group_by(AP_o, DP_o, DP_rev_o) %>% summarize(I = sum(I), .groups = "drop") %>% filter(DP_rev_o > 0) latest_observed <- individual_data$training.data %>% filter(DP_rev_o >= TR_o) %>% mutate(DP_o = max(individual_data$training.data$DP_rev_o) - DP_rev_o + 1) %>% group_by(AP_o) %>% mutate(DP_max = max(DP_o)) %>% group_by(AP_o, DP_max) %>% summarize(I = sum(I), .groups = "drop") clmodel <- individual_data$training.data %>% mutate(DP_o = max(individual_data$training.data$DP_rev_o) - DP_rev_o + 1) %>% group_by(AP_o, DP_o) %>% summarize(I = sum(I), .groups = "drop") %>% group_by(AP_o) %>% arrange(DP_o) %>% mutate(I_cum = cumsum(I), I_cum_lag = lag(I_cum, default = 0)) %>% ungroup() %>% group_by(DP_o) %>% reframe(df = sum(I_cum * ( AP_o <= max(individual_data$training.data$AP_o) - DP_o + 1 )) / sum(I_cum_lag * ( AP_o <= max(individual_data$training.data$AP_o) - DP_o + 1 )), I = sum(I * ( AP_o <= max(individual_data$training.data$AP_o) - DP_o ))) %>% mutate(DP_o_join = DP_o) %>% mutate(DP_rev_o = max(DP_o) - DP_o + 1) predictions <- expand.grid(AP_o = latest_observed$AP_o, DP_o = clmodel$DP_o_join) %>% left_join(clmodel[, c("DP_o_join", "df")], by = c("DP_o" = "DP_o_join")) %>% left_join(latest_observed, by = "AP_o") %>% rowwise() %>% filter(DP_o > DP_max) %>% ungroup() %>% group_by(AP_o) %>% arrange(DP_o) %>% mutate(df_cum = cumprod(df)) %>% mutate(I_expected = I * df_cum - I * lag(df_cum, default = 1)) %>% select(DP_o, AP_o, I_expected) conversion_factor <- individual_data$conversion_factor max_dp_i <- 1440 score_total <- predictions %>% inner_join(true_output_cl, by = c("AP_o", "DP_o")) %>% mutate(ave = I - I_expected, abs_ave = abs(ave)) %>% # from here it is reformulated for the are tot ungroup() %>% group_by(AP_o, DP_rev_o) %>% reframe(abs_ave = abs(sum(ave)), I = sum(I)) are_tot <- sum(score_total$abs_ave) / sum(score_total$I) score_diagonal <- individual_data$full.data %>% mutate( DP_rev_o = floor(max_dp_i * conversion_factor) - ceiling( DP_i * conversion_factor + ((AP_i - 1) %% ( 1 / conversion_factor )) * conversion_factor) + 1, AP_o = ceiling(AP_i * conversion_factor) ) %>% group_by(claim_type, AP_o, DP_rev_o) %>% mutate(claim_type = as.character(claim_type)) %>% summarize(I = sum(I), .groups = "drop") %>% group_by(claim_type, AP_o) %>% arrange(desc(DP_rev_o)) %>% mutate(I_cum = cumsum(I)) %>% mutate(I_cum_lag = lag(I_cum, default = 0)) %>% mutate(DP_o = max(individual_data$training.data$DP_rev_o) - DP_rev_o + 1) %>% left_join(CL[, c("DP_o", "df")], by = c("DP_o")) %>% mutate(I_cum_hat = I_cum_lag * df, RP_o = max(DP_rev_o) - DP_rev_o + AP_o) %>% inner_join(true_output_cl[, c("AP_o", "DP_rev_o")] %>% distinct() , by = c("AP_o", "DP_rev_o")) %>% group_by(AP_o, DP_rev_o) %>% reframe(abs_ave2_diag = abs(sum(I_cum_hat) - sum(I_cum)), I = sum(I)) are_cal_q <- sum(score_diagonal$abs_ave2_diag) / sum(score_diagonal$I) ```