# Setup ####
# Bootstrap pacman only when it is missing. p_load() then installs any of the
# listed packages that are not yet available and attaches them, so no
# unconditional install.packages() call (a download on every run) is needed.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
pacman::p_load(openxlsx, pacman, tidyverse, caTools)

# Read the workbook from the current working directory.
data <- read.xlsx("heart.xlsx")

# The spreadsheet viewer is only opened in an interactive session, so the
# script can also be run non-interactively.
if (interactive()) {
  View(data)
}

# List the files in the working directory and display the accompanying text
# file (presumably the description of the data-set columns -- confirm).
dir()
file.show("heart_disease-data-sets-name-for logistic-regression.txt")

# Column types as read from the workbook, before any recoding.
str(data)
# Recode the two 0/1 indicator columns into readable labels and store them as
# factors: sex 0 -> "f", otherwise "m"; hd 0 -> "healthy", otherwise
# "unhealthy".
data$sex <- as.factor(ifelse(data$sex == 0, "f", "m"))
data$hd <- as.factor(ifelse(data$hd == 0, "healthy", "unhealthy"))

# The remaining coded columns are converted to factors as they are.
data[c("cp", "ca", "thal", "restecg", "exang", "slope")] <- lapply(
  data[c("cp", "ca", "thal", "restecg", "exang", "slope")],
  as.factor
)

# Structure check; fbs is converted only after this point, so it is still
# shown with its original type here.
str(data)
data["fbs"] <- lapply(data["fbs"], as.factor)

# Existence dependency ####
# Contingency table of health status against sex: number of healthy and
# unhealthy people for each sex.
xtabs(~ hd + sex, data = data)
# Same breakdown of health status against ca.
xtabs(~ hd + ca, data = data)

# Keep a copy of the recoded data.
data1 <- data
# Train the model and predict ####
nrow(data)

# Fix the RNG seed (the value itself is arbitrary) so the random split, and
# every result derived from it, is reproducible between runs.
set.seed(123)

# sample.split() expects the vector of outcome labels. Given the whole data
# frame it returns one TRUE/FALSE per *column*, which subset() then recycles
# over the rows. Passing data$hd yields one flag per row and keeps the
# healthy/unhealthy ratio the same in both subsets.
split <- sample.split(data$hd, SplitRatio = 0.7)
print(split)

# Rows flagged TRUE train the model; the remaining rows are held out to test it.
dtrain <- subset(data, split)
nrow(dtrain)
dtest <- subset(data, !split)
nrow(dtest)

# The spreadsheet viewer is only opened in an interactive session.
if (interactive()) {
  View(dtrain)
  View(dtest)
}
# Model creation
# Logistic regression (binomial family) of hd on all other columns of dtrain.
model <- glm(
  formula = hd ~ .,
  family = "binomial",
  data = dtrain
)

# Coefficient estimates with standard errors and p-values, plus the null and
# residual deviances of the fit.
summary(model)

# type = "response" returns one fitted probability per row of dtest (the
# probability of the second level of hd), not a class label or a count.
predict <- predict(model, newdata = dtest, type = "response")
print(predict)
class(predict)
# Matrix creation ####
# Confusion matrix of actual against predicted health status, using 0.5 as the
# probability cut-off. The predicted labels are built as a factor with both
# levels fixed, so the table is always 2 x 2 -- even when every prediction
# falls on the same side of the cut-off (a plain TRUE/FALSE table would then
# have a single column and the indexing below would fail).
comatrix <- table(
  actual = dtest$hd,
  predicted = factor(
    ifelse(predict > 0.5, "unhealthy", "healthy"),
    levels = c("healthy", "unhealthy")
  )
)
comatrix

# Number of healthy and of unhealthy people in the testing data.
dtest %>%
  filter(hd == "healthy") %>%
  nrow()
dtest %>%
  filter(hd == "unhealthy") %>%
  nrow()

# Healthy people that were also predicted as healthy.
comatrix[1, 1]

# Accuracy: correctly classified cases (the diagonal) over all tabulated cases.
accuracy <- sum(diag(comatrix)) / sum(comatrix)
accuracy
# HW ####
# Phishing data: make the result column 1 where it is more than 5 and 0 for
# the rest.
print(predict)

# Round the predicted probabilities to 0 or 1.
predict <- round(predict)

# Store the prediction next to the test rows as a label: 0 is "healthy",
# anything else is "unhealthy".
dtest$predict <- ifelse(predict == 0, "healthy", "unhealthy")

# All people that were predicted as unhealthy are stored in 'call'.
call <- dtest %>%
  filter(predict == "unhealthy")
View(call)