The project presents a proof of concept of a Fraud Detection Analytics Solution. The purpose of the project is to demonstrate how potential fraud can be detected, using a mock data set that reflects medical records (generated with mockaroo.com) together with public datasets from kaggle.com.
- mockaroo.com - lets you generate up to 1,000 rows of realistic test data in CSV, JSON, SQL, and Excel formats
- kaggle.com - Access a huge repository of community published data & code
y = 0.015x + 12.757
R² = 0.8291
r=0.910549285
Strong Positive Correlation
# Read the per-provider claim and double-payment counts from CSV (the first
# row holds the column names), then show the data frame; typing the bare name
# auto-prints it when run at the console.
DPvsProv <- read.csv("DPvsProv.csv", header = TRUE)
DPvsProv
PROVIDER_ID PROV_CLM_COUNT D_FLAG
1 0 1421 146
2 3 46 0
3 11 36256 964
4 17 10119 223
# Scatter plot of double-payment count (D_FLAG) against total claim count
# (PROV_CLM_COUNT), one point per row of DPvsProv.
plot(
  DPvsProv$PROV_CLM_COUNT, DPvsProv$D_FLAG,
  xlab = "Total number of claims per provider",
  ylab = "number of double-payments per provider",
  pch = 16, cex = 1.3, col = "blue"
)
# The same plot drawn via with(): column names are resolved inside DPvsProv,
# so the `DPvsProv$` prefix does not need to be repeated.
with(
  DPvsProv,
  plot(
    PROV_CLM_COUNT, D_FLAG,
    xlab = "Total number of claims per provider",
    ylab = "number of double-payments per provider",
    pch = 16, cex = 1.3, col = "blue"
  )
)
y = 0.0085x - 51.938
R² = 0.23
r=0.479583152
Weak Positive Correlation
# Read the per-provider claim and wrong-ICD counts from CSV (the first row
# holds the column names), then show the data frame; typing the bare name
# auto-prints it when run at the console.
ICDsvsProv <- read.csv(file = "ICDsvsProv.csv", header = TRUE)
ICDsvsProv
PROVIDER_ID PROV_CLM_COUNT WRONG_ICD
1 0 1421 2
2 3 46 0
3 11 36256 145
4 17 10119 9
5 25 26712 42
6 45 843 0
7 56 128418 5173
# Scatter plot of wrong-ICD count (WRONG_ICD) against total claim count
# (PROV_CLM_COUNT), one point per row of ICDsvsProv.
plot(
  ICDsvsProv$PROV_CLM_COUNT, ICDsvsProv$WRONG_ICD,
  xlab = "Total number of claims per provider",
  ylab = "Number of Wrong ICDs per Provider",
  pch = 16, cex = 1.3, col = "blue"
)
# The same plot drawn via with(): column names are resolved inside ICDsvsProv,
# so the `ICDsvsProv$` prefix does not need to be repeated.
with(
  ICDsvsProv,
  plot(
    PROV_CLM_COUNT, WRONG_ICD,
    xlab = "Total number of claims per provider",
    ylab = "Number of Wrong ICDs per Provider",
    pch = 16, cex = 1.3, col = "blue"
  )
)
y = 0.0172x - 47.984
R² = 0.9147
r=0.956399498
Strong Positive Correlation
# Read the per-location claim and double-payment counts from CSV (the first
# row holds the column names), then show the data frame; typing the bare name
# auto-prints it when run at the console.
DPvsLoc <- read.csv(file = "DPvsLoc.csv", header = TRUE)
DPvsLoc
PROV_LOCATION_ID LOC_CLM_COUNT D_FLAG
1 1 329879 4809
2 2 268034 3779
3 3 283200 6040
4 4 107953 1925
5 5 94588 1607
6 6 582 18
7 7 173951 1200
# Scatter plot of double-payment count (D_FLAG) against total claim count
# (LOC_CLM_COUNT), one point per row of DPvsLoc.
plot(
  DPvsLoc$LOC_CLM_COUNT, DPvsLoc$D_FLAG,
  xlab = "Total number of claims per location",
  ylab = "number of double-payments per location",
  pch = 16, cex = 1.3, col = "blue"
)
# The same plot drawn via with(): column names are resolved inside DPvsLoc,
# so the `DPvsLoc$` prefix does not need to be repeated.
with(
  DPvsLoc,
  plot(
    LOC_CLM_COUNT, D_FLAG,
    xlab = "Total number of claims per location",
    ylab = "number of double-payments per location",
    pch = 16, cex = 1.3, col = "blue"
  )
)
y = 0.0043x - 34.453
R² = 0.8262
r=0.908955444
Strong Positive Correlation
# Read the per-location claim and wrong-ICD counts from CSV (the first row
# holds the column names), then show the data frame; typing the bare name
# auto-prints it when run at the console.
ICDsVSLoc <- read.csv(file = "ICDsVSLoc.csv", header = TRUE)
ICDsVSLoc
location_ID LOC_CLM_COUNT WRONG_ICD
PROV_LOCATION_ID LOC_CLM_COUNT WRONG_ICD
1 1 329879 1205
2 2 268034 1208
3 3 283200 1023
4 4 107953 814
# Scatter plot of wrong-ICD count (WRONG_ICD) against total claim count
# (LOC_CLM_COUNT), one point per row of ICDsVSLoc.
plot(
  ICDsVSLoc$LOC_CLM_COUNT, ICDsVSLoc$WRONG_ICD,
  xlab = "Total number of claims per location",
  ylab = "Number of Wrong ICDs per location",
  pch = 16, cex = 1.3, col = "blue"
)
# The same plot drawn via with(): column names are resolved inside ICDsVSLoc,
# so the `ICDsVSLoc$` prefix does not need to be repeated.
with(
  ICDsVSLoc,
  plot(
    LOC_CLM_COUNT, WRONG_ICD,
    xlab = "Total number of claims per location",
    ylab = "Number of Wrong ICDs per location",
    pch = 16, cex = 1.3, col = "blue"
  )
)