An example of working with the Boston Housing dataset

library(MASS)
# View() opens a spreadsheet-style data viewer and needs a GUI, so only call
# it in interactive sessions; batch runs (Rscript, knitting) skip it.
if (interactive()) {
  View(Boston) # view the dataset
}
?Boston # get information on the variables
## starting httpd help server ... done
names(Boston)
##  [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"    
##  [8] "dis"     "rad"     "tax"     "ptratio" "black"   "lstat"   "medv"

Fit a basic linear model to the data, print its summary, and plot the data together with the fitted regression line.

# Simple linear regression of median home value (medv) on the percentage of
# lower-status population (lstat). The formula operator must be the ASCII
# tilde `~`; look-alike Unicode characters do not parse.
fit1 <- lm(medv ~ lstat, data = Boston)
summary(fit1)
## 
## Call:
## lm(formula = medv ~ lstat, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.168  -3.990  -1.318   2.034  24.500 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 34.55384    0.56263   61.41   <2e-16 ***
## lstat       -0.95005    0.03873  -24.53   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared:  0.5441, Adjusted R-squared:  0.5432 
## F-statistic: 601.6 on 1 and 504 DF,  p-value: < 2.2e-16
# Scatter plot of median house value against the lower-status percentage;
# with() lets the columns be referenced by name inside the plot() call.
with(Boston, plot(
  x = lstat,
  y = medv,
  main = "Boston House Prices vs. Population Status",
  xlab = "Lower Status of the Population",
  ylab = "Median House Value"
))

# Overlay the fitted line from fit1 on the scatter plot
abline(fit1)

You can also add more predictor variables to the model, or apply a transformation to a variable before fitting.

# Multivariable regression: model medv using both lstat and tax as
# predictors. The formula uses the ASCII tilde `~` so that the line parses.
fit2 <- lm(medv ~ lstat + tax, data = Boston)
summary(fit2)
## 
## Call:
## lm(formula = medv ~ lstat + tax, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.553  -4.034  -1.384   1.983  26.053 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 35.835001   0.735532  48.720  < 2e-16 ***
## lstat       -0.883127   0.045880 -19.248  < 2e-16 ***
## tax         -0.005212   0.001944  -2.681  0.00757 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.178 on 503 degrees of freedom
## Multiple R-squared:  0.5506, Adjusted R-squared:  0.5488 
## F-statistic: 308.1 on 2 and 503 DF,  p-value: < 2.2e-16
# Variable transformation: store the squared room count as a new column so it
# can be used as a predictor.
Boston$rmsq <- Boston$rm^2

# Regress medv on the squared room count. The predictor is referenced by its
# column name (not Boston$rmsq) so that lm() resolves it through `data` and
# predict() honours a `newdata` argument.
fit3 <- lm(medv ~ rmsq, data = Boston)

plot(x = Boston$rm, y = Boston$medv,
     main = "Boston House Prices vs. Rooms",
     xlab = "Rooms",
     ylab = "Median House Value")

# Plot the regression curve: pair every observed rm with its fitted value and
# sort by rm so that lines() draws left to right instead of zig-zagging.
curve.dat <- data.frame(x = Boston$rm, y = predict(fit3))
curve.dat <- curve.dat[order(curve.dat$x), ]
lines(curve.dat, col = 4)

Now experiment with different variables and variable transformations. See how high an \(R^2\) value you can get.

# Report the R-squared of the squared-rooms model (fit3)
print(summary(fit3)[["r.squared"]])
## [1] 0.5157671