IMR Machine Learning Code.

For anyone else who wants to build on this, here’s the code to generate the prediction algorithm for my Independent Medical Review Prediction App:

In the R language

#Libraries

require(tidyr)
require(caret)
require(randomForest)
require(FactoMineR)
require(magrittr)
require(tm)

#Load Data
df<-read.csv(“link to data goes here”, stringsAsFactors=F)

str(df)

#Data Cleaning
#https://stackoverflow.com/questions/33180058/coerce-multiple-columns-to-factors-at-once

cols<-c(“DiagnosisCategory”, “DiagnosisSubCategory”, “TreatmentCategory”,
“TreatmentSubCategory”, “Determination”, “Type”, “AgeRange”, “PatientGender”, “IMRType”)
df[cols]<-lapply(df[cols], function (y) gsub(“[[:punct:]]”, “”, y))

#Removes all special characters, we’ve got some weird ones
df[cols]<-lapply(df[cols], function (y) gsub(” “, “”, y))
df[cols]<-lapply(df[cols], as.factor)

str(df)
#Replace NAs and drop dead columns

sum(is.na(df$ReportYear))
df[is.na(df)]<-“NA”

df$ReferenceID<-NULL
df$IMRType<-NULL
df$DaysToReview<-NULL
df$DaysToAdopt<-NULL
df$Findings<-NULL

#Manually create dummy factors for identification and variable selection
#https://stackoverflow.com/questions/11952706/generate-a-dummy-variable
str(df)
df<-cbind(subset(df, select=-DiagnosisCategory), model.matrix(~df$DiagnosisCategory+0))
df<-cbind(subset(df, select=-DiagnosisSubCategory), model.matrix(~df$DiagnosisSubCategory+0))
df<-cbind(subset(df, select=-TreatmentCategory), model.matrix(~df$TreatmentCategory+0))
df<-cbind(subset(df, select=-TreatmentSubCategory), model.matrix(~df$TreatmentSubCategory+0))
df<-cbind(subset(df, select=-Type), model.matrix(~df$Type+0))
df<-cbind(subset(df, select=-AgeRange), model.matrix(~df$AgeRange+0))
df<-cbind(subset(df, select=-PatientGender), model.matrix(~df$PatientGender+0))
colnames(df)<-gsub(“df\\$”, “”, colnames(df)) #Otherwise you get colnames like ‘df$DiagnosisCategoryVision’

#More Cleaning, drops useless data
idfCleanData<-function(idf, cutoff=0.97) {
idfTests<-rep(T, ncol(idf))
for (i in 1:ncol(idf)){
#This basically makes a histogram/table for each unique value
#It then grabs the most common value and checks if it’s less common than
#the incidence rate. If it is, we keep it.
idfTests[i]<-max(table(idf[,i]))/length(idf[,i])<=cutoff
}
return(idf[,idfTests])
}
df<-idfCleanData(df, 0.9999)

str(df)

#train/test split
set.seed(69412)
index<-sample(1:nrow(df), floor(0.75*nrow(df)), replace=F)
dfTrain<-df[index, ]
dfTest<-df[-index,]
dfTestAnswers<-dfTest$Determination
dfTest<-subset(dfTest, select=-Determination)

rfTest<-randomForest(Determination~., df)
rfPrediction<-predict(rfTest, dfTest)
confusionMatrix(rfPrediction, dfTestAnswers)
table(dfTestAnswers)
saveRDS(rfTest, “IMRRandomForest.rda”)

Leave a Reply

Your email address will not be published. Required fields are marked *