# packages library(robustbase) # set dir. setwd("D:/telhado") # read data data <- read.table("data/telhado.txt", header=TRUE) # summary summary(data) round(sd(data$telhados), 2) round(sd(data$nclientes), 2) # boxplot par(mar=c(5.5,5.5,2,2), mfrow=c(1,2)) adjbox(data$telhados, ylab="Telhados vendidos", cex.lab=1.5, cex.axis=1.2) plot(density(data$telhados), xlab="Telhados vendidos", ylab="Densidade", main="", cex.lab=1.5, cex.axis=1.2, lwd=2) # dispersion par(mar=c(5.5,5.5,2,2), mfrow=c(1,1)) plot(data$nclientes, data$telhados, xlab="Clientes cadastrados", ylab="Telhados vendidos", pch=16, cex.lab=1.5, cex.axis=1.2) lines(smooth.spline(data$nclientes, data$telhados, df=3), lwd=2, col="red") # model fit1 <- lm(telhados ~ nclientes, data) summary(fit1) # coefs round(fit1$coefficients, 2) round(vcov(fit1), 2) # adjust plot(data$nclientes, data$telhados, xlab="Clientes cadastrados", ylab="Telhados vendidos", pch=16, cex.lab=1.5, cex.axis=1.2) abline(fit1, lwd=2) # diag envel_norm <- function(fit.model, nsim, prob){ # residuo studentizado X <- model.matrix(fit.model) n <- nrow(X) p <- ncol(X) H <- X%*%solve(t(X)%*%X)%*%t(X) h <- diag(H) si <- lm.influence(fit.model)$sigma r <- resid(fit.model) tsi <- r/(si*sqrt(1-h)) # prep ident <- diag(n) epsilon <- matrix(0,n,nsim) e <- matrix(0,n,nsim) e1 <- numeric(n) e2 <- numeric(n) # sim for(i in 1:nsim){ epsilon[,i] <- rnorm(n,0,1) e[,i] <- (ident - H)%*%epsilon[,i] u <- diag(ident - H) e[,i] <- e[,i]/sqrt(u) e[,i] <- sort(e[,i]) } # band for(i in 1:n){ eo <- sort(e[i,]) e1[i] <- quantile(eo, probs=((1-prob)/2), na.rm=FALSE, names=TRUE, type=1) e2[i] <- quantile(eo, probs=((1+prob)/2), na.rm=FALSE, names=TRUE, type=1) } # plot par(mar=c(5.5,5.5,2,2), mfrow=c(1,2)) plot(predict(fit.model), tsi, xlab="Valor ajustado", ylab="Residuo studentizado", pch=16, main="", cex.lab=1.5, cex.axis=1.2) # plot med <- apply(e,1,mean) faixa <- range(tsi,e1,e2) qqnorm(tsi, xlab="Quantil da N(0,1)", ylab="Residuo studentizado", ylim=faixa, pch=16, main="", cex.lab=1.5, cex.axis=1.2) par(new=TRUE) qqnorm(e1, axes=FALSE, xlab="", ylab="", type="l", ylim=faixa, main="") par(new=TRUE) qqnorm(e2, axes=FALSE, xlab="", ylab="", type="l", ylim=faixa, main="") par(new=TRUE) qqnorm(med, axes=FALSE, xlab="", ylab="", type="l", ylim=faixa, lty=2, main="") } envel_norm(fit1, 1000, 0.95) diag_cook_norm <- function(fit.model, ident){ # dist. Cook X <- model.matrix(fit.model) n <- nrow(X) p <- ncol(X) H <- X%*%solve(t(X)%*%X)%*%t(X) h <- diag(H) r <- resid(fit.model) s <- sqrt(sum(r*r)/(n-p)) ts <- r/(s*sqrt(1-h)) di <- (1/p)*(h/(1-h))*(ts^2) # plot par(mar=c(5.5,5.5,2,2), mfrow=c(1,1)) a <- max(di) b <- min(di) plot(di, xlab="Índice", ylab="Distância de Cook", pch=16, ylim=c(b, a+0.1), cex.lab=1.5, cex.axis=1.2, main="") cut <- 2*sd(di) + mean(di) abline(cut,0, lty=2, lwd=2) identify(di, n=ident, cex=1.2) } diag_cook_norm(fit1, 2) # confirmatory fit1.1 <- lm(telhados ~ nclientes, data[-6,]) fit1.2 <- lm(telhados ~ nclientes, data[-10,]) plot(data$nclientes, data$telhados, xlab="Clientes cadastrados", ylab="Telhados vendidos", pch=16, cex.lab=1.5, cex.axis=1.2) identify(data$nclientes, data$telhados, n=2, cex=1.2) abline(fit1, lwd=2) abline(fit1.1, lwd=2, col=2) abline(fit1.2, lwd=2, col=3) # confidence band z <- seq(25, 76, 0.1) reta <- cbind(1,z)%*%fit1$coefficients Sxx <- sum((data$nclientes - mean(data$nclientes))^2) hz <- ((z - mean(data$nclientes))^2)/Sxx delta1 <- sigma(fit1)*qt(0.975, 24)*sqrt(1/26 + hz) par(mar=c(5.5,5.5,2,2), mfrow=c(1,2)) plot(data$nclientes, data$telhados, ylab="Telhados vendidos", ylim=c(-100,450), xlab="Clientes cadastrados", type="n", cex.lab=1.5, cex.axis=1.2, lwd=2) LI <- reta - delta1 LS <- reta + delta1 M <- reta lines(z, LI, lty=2, lwd=2) lines(z, LS, lty=2, lwd=2) lines(z, M, lwd=2) delta2 <- sigma(fit1)*qt(0.975, 24)*sqrt(1 + 1/26 + hz) plot(data$nclientes, data$telhados, ylab="Telhados vendidos", ylim=c(-100,450), xlab="Clientes cadastrados", type="n", cex.lab=1.5, cex.axis=1.2, lwd=2) points(data$nclientes, data$telhados, pch=16, cex=1, lwd=2) LI <- reta - delta2 LS <- reta + delta2 M <- reta lines(z, LI, lty=2, lwd=2) lines(z, LS, lty=2, lwd=2) lines(z, M, lwd=2)