# Ensure the 'pacman' package manager is available, then use it to
# install (when missing) and attach every package this analysis needs.
load.pac <- function() {
  # require() both checks availability AND attaches the package, so the
  # original follow-up library(pacman) call in the success branch was
  # redundant; only install and attach when the check fails.
  if (!require("pacman")) {
    install.packages("pacman")
    library(pacman)
  }
  pacman::p_load(xts, sp, gstat, ggplot2, rmarkdown, reshape2, ggmap,
                 wesanderson, parallel, dplyr, plotly, tidyverse,
                 reticulate, UsingR, Rmpfr, latex2exp, mise, GGally,
                 usmap)
  # devtools::install_github("tidyverse/tidyverse")
}
# Install/attach all required packages via the helper defined above.
load.pac()
## Loading required package: pacman
# mise() ("mise en place") resets the session: clears variables,
# figures, and the console.
mise()
## Load Data
# Read the four raw data sets. Wrapping each assignment in parentheses
# also prints the object; head() previews the first rows.
(adult <- read.csv(file = "./10_data/AdultDataSet/adult.data")) %>% head()
(aus <- read.table(file = "./10_data/Australian/australian.dat", header = TRUE)) %>% head()
(mob <- read.csv(file = "./10_data/Datamobile Data Set/imports-85.data")) %>% head()
(house <- read.table(file = "./10_data/Housing/housing.data", header = TRUE) ) %>% head()
First, investigate the features of the adult data set:
names(adult)
## [1] "Age" "Workclass" "Wage"
## [4] "Education" "Years.of.Education" "Marital.Status"
## [7] "Occupation" "Relationship" "Race"
## [10] "Sex" "Capital.Gain" "Capital.Loss"
## [13] "Hours.Per.Week" "Native.Country" "Income.Threshold"
Now plot the data relating to Income Threshold:
# Compare the distribution of weekly working hours across the two
# income brackets, separately for each sex.
ggplot(adult, aes(y = Income.Threshold, x = Hours.Per.Week, fill = Sex)) +
  geom_boxplot() +
  labs(
    x = "Hours per Week",
    y = "Income Threshold",
    title = "Income Threshold Given Hours per Week Across Genders"
  )
This demonstrates that men tend to work longer hours, and that women need to work longer hours in order to earn above $50K.
# Box plot of the Wage column against income bracket, filled by sex.
ggplot(adult, aes(x = Wage, y = Income.Threshold, fill = Sex)) +
  geom_boxplot()
This suggests that wage has no bearing on the income threshold, which is unlikely to be true; there may be an issue with the data.
# Column chart of Wage per occupation; x labels are rotated so the long
# occupation names remain readable.
ggplot(adult, aes(x = Occupation, fill = Occupation, y = Wage)) +
  geom_col(position = 'dodge') +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  # guides(fill = FALSE) is deprecated since ggplot2 3.3.4; "none" is
  # the supported way to suppress the (redundant) fill legend.
  guides(fill = "none") +
  labs(title = "Occupation and Salary")
# Column chart of Wage per education level, mirroring the occupation
# plot above.
ggplot(adult, aes(x = Education, fill = Education, y = Wage)) +
  geom_col(position = 'dodge') +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  # guides(fill = FALSE) is deprecated since ggplot2 3.3.4; "none" is
  # the supported way to suppress the (redundant) fill legend.
  guides(fill = "none") +
  labs(title = "Education and Salary")
This plot suggests that the less college you complete, the more money you will earn; again, there may be something awry with this data.
This data set has non descriptive features, for visual analytics this might be better because it allows for analysis without prejudice.
The best thing to do is to use a correlation plot in order to determine which features are the most significant:
# Visualise pairwise correlations of the Australian data set; ellipses
# encode both the sign and strength of each correlation.
library(corrplot)
## corrplot 0.84 loaded
head(aus)
corrplot(cor(aus), method = "ellipse", type = "upper")
From this we can tell that `X0.3` and `X0` are highly correlated, as are `X0.1` and `X0.2`; there also appears to be a weak negative correlation between `X100` and `X11.46`. These relationships can be represented by a scatter plot to better understand the behaviour, for example:
# Scatter plot of the two weakly negatively correlated columns,
# coloured by X1 treated as a factor; labels use LaTeX subscripts.
ggplot(aus, aes(x = X100, y = X11.46, col = factor(X1))) +
  geom_point() +
  labs(x = TeX("X_{100}"), y = TeX("X_{11.46}")) +
  guides(col = guide_legend(TeX("X_1")))
The data can be inspected using `str`:
str(mob)
## 'data.frame': 205 obs. of 26 variables:
## $ symboling : int 3 3 1 2 2 2 1 1 1 0 ...
## $ normalized.losses: chr "?" "?" "?" "164" ...
## $ make : chr "alfa-romero" "alfa-romero" "alfa-romero" "audi" ...
## $ fuel.type : chr "gas" "gas" "gas" "gas" ...
## $ aspiration : chr "std" "std" "std" "std" ...
## $ num.of.doors : chr "two" "two" "two" "four" ...
## $ body.style : chr "convertible" "convertible" "hatchback" "sedan" ...
## $ drive.wheels : chr "rwd" "rwd" "rwd" "fwd" ...
## $ engine.location : chr "front" "front" "front" "front" ...
## $ wheel.base : num 88.6 88.6 94.5 99.8 99.4 ...
## $ length : num 169 169 171 177 177 ...
## $ width : num 64.1 64.1 65.5 66.2 66.4 66.3 71.4 71.4 71.4 67.9 ...
## $ height : num 48.8 48.8 52.4 54.3 54.3 53.1 55.7 55.7 55.9 52 ...
## $ curb.weight : int 2548 2548 2823 2337 2824 2507 2844 2954 3086 3053 ...
## $ engine.type : chr "dohc" "dohc" "ohcv" "ohc" ...
## $ num.of.cylinders : chr "four" "four" "six" "four" ...
## $ engine.size : int 130 130 152 109 136 136 136 136 131 131 ...
## $ fuel.system : chr "mpfi" "mpfi" "mpfi" "mpfi" ...
## $ bore : chr "3.47" "3.47" "2.68" "3.19" ...
## $ stroke : chr "2.68" "2.68" "3.47" "3.40" ...
## $ compression.ratio: num 9 9 9 10 8 8.5 8.5 8.5 8.3 7 ...
## $ horsepower : chr "111" "111" "154" "102" ...
## $ peak.rpm : chr "5000" "5000" "5000" "5500" ...
## $ city.mpg : int 21 21 19 24 18 19 19 19 17 16 ...
## $ highway.mpg : int 27 27 26 30 22 25 25 25 20 22 ...
## $ price : chr "13495" "16500" "16500" "13950" ...
This reveals the data are car metrics, we can then visualise different aspects of the data that might be interesting:
# price was read as character (the str() output above shows it as chr,
# with "?" used for missing values elsewhere in this file); coercing to
# numeric turns non-numeric entries into NA — hence the warning.
mob$price <- as.numeric(mob$price)
## Warning: NAs introduced by coercion
# Total listed price per manufacturer.
ggplot(mob, aes(y = price, x = make, fill = make)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  # guides(fill = FALSE) is deprecated since ggplot2 3.3.4; use "none".
  guides(fill = "none")
## Warning: Removed 4 rows containing missing values (position_stack).
This shows that Toyota, Mercedes, and BMW produce some of the most expensive vehicles.
# Coerce horsepower (stored as character in the raw file) to numeric,
# then plot power-to-weight ratio against city fuel economy with price
# mapped to point colour.
mob$horsepower <- as.numeric(mob$horsepower)
## Warning: NAs introduced by coercion
ggplot(mob, aes(x = city.mpg, y = horsepower/curb.weight, col = price)) +
  theme_bw() +
  geom_point() +
  labs(
    x = TeX("Mileage (\\frac{mi}{gl})"),
    y = TeX("Power to Weight (\\frac{hp}{lbs})"),
    title = "Vehicle Performance and Fuel Consumption"
  )
## Warning: Removed 2 rows containing missing values (geom_point).
This indicates that high-performance (high power-to-weight) vehicles tend to consume more fuel, and this change in fuel consumption appears to scale linearly.
The price appears not to change significantly across fuel consumption or power to weight.
The housing data can be inspected using the `str` function; background information on the data set is available in its accompanying documentation.
str(house)
## 'data.frame': 506 obs. of 14 variables:
## $ CRIM : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ ZN : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ INDUS : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ CHAS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NOX : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ RM : num 6.58 6.42 7.18 7 7.15 ...
## $ AGE : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ DIS : num 4.09 4.97 4.97 6.06 6.06 ...
## $ RAD : int 1 2 2 3 3 3 5 5 5 5 ...
## $ TAX : num 296 242 242 222 222 222 311 311 311 311 ...
## $ PTRATIO: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ B : num 397 397 393 395 397 ...
## $ LSTAT : num 4.98 9.14 4.03 2.94 5.33 ...
## $ MEDV : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
In order to get an understanding of the relationship between the data a scatter plot matrix can be used:
library(GGally)
# Correlation matrix of the housing variables, drawn as ellipses in the
# upper triangle.
corrplot(cor(house), method = "ellipse", type = "upper")
This suggests that the median house value is significantly impacted by the number of rooms (`RM`), the lower-status share of the population (`LSTAT`), and the concentration of nitric oxides in the air (`NOX`); this can be visualised with a scatter plot:
# Scatter plot: median value against NOX concentration, with point
# shape giving the rounded room count and colour giving LSTAT.
ggplot(house, aes(x = NOX, y = MEDV, shape = factor(round(RM)), col = LSTAT)) +
  theme_bw() +
  geom_point() +
  labs(
    x = TeX("Nitrogen Oxides Concentration (10^{-1} ppm)"),
    y = "Median House Value",
    title = " Housing Prices"
  )
# Read the superstore workbook with readxl; piping into str() prints
# the structure of the resulting tibble (coercion warnings echoed
# below come from a text postal code being forced to numeric).
library("readxl")
(store <- read_excel("./10_data/10_data_Superstore.xls")) %>% str()
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Coercing text to numeric in L2236 / R2236C12: '05408'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Coercing text to numeric in L5276 / R5276C12: '05408'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Coercing text to numeric in L8800 / R8800C12: '05408'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Coercing text to numeric in L9148 / R9148C12: '05408'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Coercing text to numeric in L9149 / R9149C12: '05408'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Coercing text to numeric in L9150 / R9150C12: '05408'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Coercing text to numeric in L9388 / R9388C12: '05408'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Coercing text to numeric in L9389 / R9389C12: '05408'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Coercing text to numeric in L9390 / R9390C12: '05408'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Coercing text to numeric in L9391 / R9391C12: '05408'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Coercing text to numeric in L9743 / R9743C12: '05408'
## tibble [9,994 × 21] (S3: tbl_df/tbl/data.frame)
## $ Row ID : num [1:9994] 1 2 3 4 5 6 7 8 9 10 ...
## $ Order ID : chr [1:9994] "CA-2013-152156" "CA-2013-152156" "CA-2013-138688" "US-2012-108966" ...
## $ Order Date : POSIXct[1:9994], format: "2013-11-09" "2013-11-09" ...
## $ Ship Date : POSIXct[1:9994], format: "2013-11-12" "2013-11-12" ...
## $ Ship Mode : chr [1:9994] "Second Class" "Second Class" "Second Class" "Standard Class" ...
## $ Customer ID : chr [1:9994] "CG-12520" "CG-12520" "DV-13045" "SO-20335" ...
## $ Customer Name: chr [1:9994] "Claire Gute" "Claire Gute" "Darrin Van Huff" "Sean O'Donnell" ...
## $ Segment : chr [1:9994] "Consumer" "Consumer" "Corporate" "Consumer" ...
## $ Country : chr [1:9994] "United States" "United States" "United States" "United States" ...
## $ City : chr [1:9994] "Henderson" "Henderson" "Los Angeles" "Fort Lauderdale" ...
## $ State : chr [1:9994] "Kentucky" "Kentucky" "California" "Florida" ...
## $ Postal Code : num [1:9994] 42420 42420 90036 33311 33311 ...
## $ Region : chr [1:9994] "South" "South" "West" "South" ...
## $ Product ID : chr [1:9994] "FUR-BO-10001798" "FUR-CH-10000454" "OFF-LA-10000240" "FUR-TA-10000577" ...
## $ Category : chr [1:9994] "Furniture" "Furniture" "Office Supplies" "Furniture" ...
## $ Sub-Category : chr [1:9994] "Bookcases" "Chairs" "Labels" "Tables" ...
## $ Product Name : chr [1:9994] "Bush Somerset Collection Bookcase" "Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back" "Self-Adhesive Address Labels for Typewriters by Universal" "Bretford CR4500 Series Slim Rectangular Table" ...
## $ Sales : num [1:9994] 262 731.9 14.6 957.6 22.4 ...
## $ Quantity : num [1:9994] 2 3 2 5 2 7 4 6 3 5 ...
## $ Discount : num [1:9994] 0 0 0 0.45 0.2 0 0 0.2 0.2 0 ...
## $ Profit : num [1:9994] 41.91 219.58 6.87 -383.03 2.52 ...
# List the store columns for reference before plotting.
names(store)
## [1] "Row ID" "Order ID" "Order Date" "Ship Date"
## [5] "Ship Mode" "Customer ID" "Customer Name" "Segment"
## [9] "Country" "City" "State" "Postal Code"
## [13] "Region" "Product ID" "Category" "Sub-Category"
## [17] "Product Name" "Sales" "Quantity" "Discount"
## [21] "Profit"
A plot of Sales vs Quantity can be produced:
# Encode Quantity as an ordered factor so the plot groups by each
# discrete quantity level in numeric order.
store$Quantity <- factor(store$Quantity, levels = 1:max(store$Quantity), ordered = TRUE)
# Quantity is already a factor after the line above, so the original
# factor(Quantity) wrapper inside aes() was redundant.
p <- ggplot(data = store, aes(x = Quantity, y = Profit, col = Quantity))
p + geom_boxplot() +
  # guides(col = FALSE) is deprecated since ggplot2 3.3.4; use "none".
  guides(col = "none") +
  labs(x = "Quantity Units", title = "Profit Given Quantity sold")
This suggests that 4/5 units of quantity produce the most variation in profit, for this reason it can be anticipated that higher quantities may be preferred because the profits will be more consistent.
# Re-list the column names before the next set of plots.
names(store)
## [1] "Row ID" "Order ID" "Order Date" "Ship Date"
## [5] "Ship Mode" "Customer ID" "Customer Name" "Segment"
## [9] "Country" "City" "State" "Postal Code"
## [13] "Region" "Product ID" "Category" "Sub-Category"
## [17] "Product Name" "Sales" "Quantity" "Discount"
## [21] "Profit"
# Pie chart of Profit by Region: a single stacked bar transformed to
# polar coordinates.
ggplot(store, aes(x = "", y = Profit, fill = Region)) +
  # geom_col() is the idiomatic shorthand for geom_bar(stat = "identity").
  geom_col(width = 1) +
  coord_polar(theta = "y", start = 0)
This appears to show that the South Region earns the most profits — but that is actually wrong, and I am not yet sure why; I will need to investigate how pie charts are handled in `ggplot2`, as I did not realise they would be so poorly supported.
# Dodged bars of Sales per Region, one bar per shipping mode.
ggplot(store, aes(x = Region, y = Sales, fill = `Ship Mode`)) +
  geom_col(position = "dodge")
This indicates that most sales are made in the South and Central Regions, and that the shipping method that produces the most sales is Standard Class.
# Stacked profit per state, filled by customer segment; x labels are
# rotated so every state name stays legible.
ggplot(store, aes(x = State, y = Profit, fill = Segment)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
This shows that New York and Arkansas were the states with the biggest impact on profits; generally, consumer purchases generate the most profits.
# Column chart of Quantity against Discount.
ggplot(store, aes(x = Discount, y = Quantity)) +
  geom_col(col = "royalblue")
This suggests that the largest discount is given for 2–3 items, whereas purchasing many items does not often involve a discount.
This is as far as I got; the issue is that I would need to convert the data to have lat/long coordinates or standardised names, which would take a bit of research, whereas Tableau can interpret US place names in a way that has not been implemented in `ggplot2`.
# Choropleth of 2015 state populations using the statepop data set
# bundled with usmap; fill is a continuous population scale with
# comma-formatted legend labels.
library(usmap)
library(ggplot2)
plot_usmap(data = statepop, values = "pop_2015", color = "red") +
scale_fill_continuous(name = "Population (2015)", label = scales::comma) +
theme(legend.position = "right")
## Warning: Use of `map_df$x` is discouraged. Use `x` instead.
## Warning: Use of `map_df$y` is discouraged. Use `y` instead.
## Warning: Use of `map_df$group` is discouraged. Use `group` instead.
Different colour palettes could have been used in these visualisations, for example:
# The Sales-by-Region plot again, recoloured with the ColorBrewer
# "Dark2" qualitative palette.
ggplot(store, aes(x = Region, y = Sales, fill = `Ship Mode`)) +
  scale_fill_brewer(palette = "Dark2") +
  geom_col(position = "dodge")
# A hand-picked four-colour palette, applied via scale_fill_manual().
pt <- c(
  "#AE60B8",
  "#ED2B78",
  "#B8E3B8",
  "#FA9F7B"
)
ggplot(store, aes(x = Region, y = Sales, fill = `Ship Mode`)) +
  scale_fill_manual(values = pt) +
  geom_col(position = "dodge")
# Revisit the housing scatter plot, this time with a custom
# green-to-dark-blue colour gradient for LSTAT.
ggplot(house, aes(x = NOX, y = MEDV, shape = factor(round(RM)), col = LSTAT)) +
  geom_point() +
  theme_bw() +
  scale_colour_gradient(low = "green", high = "darkblue") +
  labs(
    x = TeX("Nitrogen Oxides Concentration (10^{-1} ppm)"),
    y = "Median House Value",
    title = " Housing Prices"
  )