# Ensure the 'pacman' package manager is available, then use it to
# install (when missing) and attach every package this analysis needs.
load.pac <- function() {
  # require() both checks availability AND attaches the package, so the
  # original follow-up library(pacman) call in the success branch was
  # redundant; only install and attach when the check fails.
  if (!require("pacman")) {
    install.packages("pacman")
    library(pacman)
  }
  pacman::p_load(xts, sp, gstat, ggplot2, rmarkdown, reshape2, ggmap,
                 wesanderson, parallel, dplyr, plotly, tidyverse,
                 reticulate, UsingR, Rmpfr, latex2exp, mise, GGally,
                 usmap)
  # devtools::install_github("tidyverse/tidyverse")
}
# Install/attach all required packages via the helper defined above.
load.pac()
## Loading required package: pacman
# mise() ("mise en place") resets the session: clears variables,
# figures, and the console.
mise()
## Load Data
# Read the four raw data sets. Wrapping each assignment in parentheses
# also prints the object; head() previews the first rows.
(adult <- read.csv(file = "./10_data/AdultDataSet/adult.data")) %>% head()
(aus <- read.table(file = "./10_data/Australian/australian.dat", header = TRUE)) %>% head()
(mob <- read.csv(file = "./10_data/Datamobile Data Set/imports-85.data")) %>% head()
(house <- read.table(file = "./10_data/Housing/housing.data", header = TRUE) ) %>% head()
First, investigate the features of the adult data set:
names(adult)
## [1] "Age" "Workclass" "Wage"
## [4] "Education" "Years.of.Education" "Marital.Status"
## [7] "Occupation" "Relationship" "Race"
## [10] "Sex" "Capital.Gain" "Capital.Loss"
## [13] "Hours.Per.Week" "Native.Country" "Income.Threshold"
Now plot the data relating to Income Threshold:
# Compare the distribution of weekly working hours across the two
# income brackets, separately for each sex.
ggplot(adult, aes(y = Income.Threshold, x = Hours.Per.Week, fill = Sex)) +
  geom_boxplot() +
  labs(
    x = "Hours per Week",
    y = "Income Threshold",
    title = "Income Threshold Given Hours per Week Across Genders"
  )
This demonstrates that men tend to work longer hours, and that women need to work longer hours in order to earn above $50K.
# Box plot of the Wage column against income bracket, filled by sex.
ggplot(adult, aes(x = Wage, y = Income.Threshold, fill = Sex)) +
  geom_boxplot()
This suggests that wage has no bearing on the income threshold, which is unlikely to be true; there may be an issue with the data.
# Column chart of Wage per occupation; x labels are rotated so the long
# occupation names remain readable.
ggplot(adult, aes(x = Occupation, fill = Occupation, y = Wage)) +
  geom_col(position = 'dodge') +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  # guides(fill = FALSE) is deprecated since ggplot2 3.3.4; "none" is
  # the supported way to suppress the (redundant) fill legend.
  guides(fill = "none") +
  labs(title = "Occupation and Salary")
# Column chart of Wage per education level, mirroring the occupation
# plot above.
ggplot(adult, aes(x = Education, fill = Education, y = Wage)) +
  geom_col(position = 'dodge') +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  # guides(fill = FALSE) is deprecated since ggplot2 3.3.4; "none" is
  # the supported way to suppress the (redundant) fill legend.
  guides(fill = "none") +
  labs(title = "Education and Salary")
This plot suggests that the less college you complete, the more money you will earn; again, there may be something awry with this data.
This data set has non descriptive features, for visual analytics this might be better because it allows for analysis without prejudice.
The best thing to do is to use a correlation plot in order to determine which features are the most significant:
# Visualise pairwise correlations of the Australian data set; ellipses
# encode both the sign and strength of each correlation.
library(corrplot)
## corrplot 0.84 loaded
head(aus)
corrplot(cor(aus), method = "ellipse", type = "upper")
From this we can tell that `X0.3` and `X0` are highly correlated, as are `X0.1` and `X0.2`; there also appears to be a weak negative correlation between `X100` and `X11.46`. These relationships can be represented by a scatter plot to better understand the behaviour, for example:
# Scatter plot of the two weakly negatively correlated columns,
# coloured by X1 treated as a factor; labels use LaTeX subscripts.
ggplot(aus, aes(x = X100, y = X11.46, col = factor(X1))) +
  geom_point() +
  labs(x = TeX("X_{100}"), y = TeX("X_{11.46}")) +
  guides(col = guide_legend(TeX("X_1")))
The data can be inspected using `str`:
str(mob)
## 'data.frame': 205 obs. of 26 variables:
## $ symboling : int 3 3 1 2 2 2 1 1 1 0 ...
## $ normalized.losses: chr "?" "?" "?" "164" ...
## $ make : chr "alfa-romero" "alfa-romero" "alfa-romero" "audi" ...
## $ fuel.type : chr "gas" "gas" "gas" "gas" ...
## $ aspiration : chr "std" "std" "std" "std" ...
## $ num.of.doors : chr "two" "two" "two" "four" ...
## $ body.style : chr "convertible" "convertible" "hatchback" "sedan" ...
## $ drive.wheels : chr "rwd" "rwd" "rwd" "fwd" ...
## $ engine.location : chr "front" "front" "front" "front" ...
## $ wheel.base : num 88.6 88.6 94.5 99.8 99.4 ...
## $ length : num 169 169 171 177 177 ...
## $ width : num 64.1 64.1 65.5 66.2 66.4 66.3 71.4 71.4 71.4 67.9 ...
## $ height : num 48.8 48.8 52.4 54.3 54.3 53.1 55.7 55.7 55.9 52 ...
## $ curb.weight : int 2548 2548 2823 2337 2824 2507 2844 2954 3086 3053 ...
## $ engine.type : chr "dohc" "dohc" "ohcv" "ohc" ...
## $ num.of.cylinders : chr "four" "four" "six" "four" ...
## $ engine.size : int 130 130 152 109 136 136 136 136 131 131 ...
## $ fuel.system : chr "mpfi" "mpfi" "mpfi" "mpfi" ...
## $ bore : chr "3.47" "3.47" "2.68" "3.19" ...
## $ stroke : chr "2.68" "2.68" "3.47" "3.40" ...
## $ compression.ratio: num 9 9 9 10 8 8.5 8.5 8.5 8.3 7 ...
## $ horsepower : chr "111" "111" "154" "102" ...
## $ peak.rpm : chr "5000" "5000" "5000" "5500" ...
## $ city.mpg : int 21 21 19 24 18 19 19 19 17 16 ...
## $ highway.mpg : int 27 27 26 30 22 25 25 25 20 22 ...
## $ price : chr "13495" "16500" "16500" "13950" ...
This reveals the data are car metrics, we can then visualise different aspects of the data that might be interesting:
# price was read as character (the str() output above shows it as chr,
# with "?" used for missing values elsewhere in this file); coercing to
# numeric turns non-numeric entries into NA — hence the warning.
mob$price <- as.numeric(mob$price)
## Warning: NAs introduced by coercion
# Total listed price per manufacturer.
ggplot(mob, aes(y = price, x = make, fill = make)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  # guides(fill = FALSE) is deprecated since ggplot2 3.3.4; use "none".
  guides(fill = "none")
## Warning: Removed 4 rows containing missing values (position_stack).
This shows that Toyota, Mercedes, and BMW produce some of the most expensive vehicles.
# Coerce horsepower (stored as character in the raw file) to numeric,
# then plot power-to-weight ratio against city fuel economy with price
# mapped to point colour.
mob$horsepower <- as.numeric(mob$horsepower)
## Warning: NAs introduced by coercion
ggplot(mob, aes(x = city.mpg, y = horsepower/curb.weight, col = price)) +
  theme_bw() +
  geom_point() +
  labs(
    x = TeX("Mileage (\\frac{mi}{gl})"),
    y = TeX("Power to Weight (\\frac{hp}{lbs})"),
    title = "Vehicle Performance and Fuel Consumption"
  )
## Warning: Removed 2 rows containing missing values (geom_point).
This indicates that high-performance (high power-to-weight) vehicles tend to consume more fuel, and this change in fuel consumption appears to scale linearly.
The price appears not to change significantly across fuel consumption or power to weight.
The housing data can be inspected using the `str` function; background information on the data set is available in its accompanying documentation.
str(house)
## 'data.frame': 506 obs. of 14 variables:
## $ CRIM : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ ZN : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ INDUS : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ CHAS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NOX : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ RM : num 6.58 6.42 7.18 7 7.15 ...
## $ AGE : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ DIS : num 4.09 4.97 4.97 6.06 6.06 ...
## $ RAD : int 1 2 2 3 3 3 5 5 5 5 ...
## $ TAX : num 296 242 242 222 222 222 311 311 311 311 ...
## $ PTRATIO: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ B : num 397 397 393 395 397 ...
## $ LSTAT : num 4.98 9.14 4.03 2.94 5.33 ...
## $ MEDV : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
In order to get an understanding of the relationship between the data a scatter plot matrix can be used:
library(GGally)
# Correlation matrix of the housing variables, drawn as ellipses in the
# upper triangle.
corrplot(cor(house), method = "ellipse", type = "upper")
This suggests that the median house value is significantly impacted by the number of rooms (`RM`), the lower-status share of the population (`LSTAT`), and the concentration of nitric oxides in the air (`NOX`); this can be visualised with a scatter plot:
# Scatter plot: median value against NOX concentration, with point
# shape giving the rounded room count and colour giving LSTAT.
ggplot(house, aes(x = NOX, y = MEDV, shape = factor(round(RM)), col = LSTAT)) +
  theme_bw() +
  geom_point() +
  labs(
    x = TeX("Nitrogen Oxides Concentration (10^{-1} ppm)"),
    y = "Median House Value",
    title = " Housing Prices"
  )
# Read the superstore workbook with readxl; piping into str() prints
# the structure of the resulting tibble (coercion warnings echoed
# below come from a text postal code being forced to numeric).
library("readxl")
(store <- read_excel("./10_data/10_data_Superstore.xls")) %>% str()
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Coercing text to numeric in L2236 / R2236C12: '05408'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Coercing text to numeric in L5276 / R5276C12: '05408'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Coercing text to numeric in L8800 / R8800C12: '05408'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Coercing text to numeric in L9148 / R9148C12: '05408'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Coercing text to numeric in L9149 / R9149C12: '05408'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Coercing text to numeric in L9150 / R9150C12: '05408'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Coercing text to numeric in L9388 / R9388C12: '05408'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Coercing text to numeric in L9389 / R9389C12: '05408'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Coercing text to numeric in L9390 / R9390C12: '05408'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Coercing text to numeric in L9391 / R9391C12: '05408'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Coercing text to numeric in L9743 / R9743C12: '05408'
## tibble [9,994 × 21] (S3: tbl_df/tbl/data.frame)
## $ Row ID : num [1:9994] 1 2 3 4 5 6 7 8 9 10 ...
## $ Order ID : chr [1:9994] "CA-2013-152156" "CA-2013-152156" "CA-2013-138688" "US-2012-108966" ...
## $ Order Date : POSIXct[1:9994], format: "2013-11-09" "2013-11-09" ...
## $ Ship Date : POSIXct[1:9994], format: "2013-11-12" "2013-11-12" ...
## $ Ship Mode : chr [1:9994] "Second Class" "Second Class" "Second Class" "Standard Class" ...
## $ Customer ID : chr [1:9994] "CG-12520" "CG-12520" "DV-13045" "SO-20335" ...
## $ Customer Name: chr [1:9994] "Claire Gute" "Claire Gute" "Darrin Van Huff" "Sean O'Donnell" ...
## $ Segment : chr [1:9994] "Consumer" "Consumer" "Corporate" "Consumer" ...
## $ Country : chr [1:9994] "United States" "United States" "United States" "United States" ...
## $ City : chr [1:9994] "Henderson" "Henderson" "Los Angeles" "Fort Lauderdale" ...
## $ State : chr [1:9994] "Kentucky" "Kentucky" "California" "Florida" ...
## $ Postal Code : num [1:9994] 42420 42420 90036 33311 33311 ...
## $ Region : chr [1:9994] "South" "South" "West" "South" ...
## $ Product ID : chr [1:9994] "FUR-BO-10001798" "FUR-CH-10000454" "OFF-LA-10000240" "FUR-TA-10000577" ...
## $ Category : chr [1:9994] "Furniture" "Furniture" "Office Supplies" "Furniture" ...
## $ Sub-Category : chr [1:9994] "Bookcases" "Chairs" "Labels" "Tables" ...
## $ Product Name : chr [1:9994] "Bush Somerset Collection Bookcase" "Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back" "Self-Adhesive Address Labels for Typewriters by Universal" "Bretford CR4500 Series Slim Rectangular Table" ...
## $ Sales : num [1:9994] 262 731.9 14.6 957.6 22.4 ...
## $ Quantity : num [1:9994] 2 3 2 5 2 7 4 6 3 5 ...
## $ Discount : num [1:9994] 0 0 0 0.45 0.2 0 0 0.2 0.2 0 ...
## $ Profit : num [1:9994] 41.91 219.58 6.87 -383.03 2.52 ...
# List the store columns for reference before plotting.
names(store)
## [1] "Row ID" "Order ID" "Order Date" "Ship Date"
## [5] "Ship Mode" "Customer ID" "Customer Name" "Segment"
## [9] "Country" "City" "State" "Postal Code"
## [13] "Region" "Product ID" "Category" "Sub-Category"
## [17] "Product Name" "Sales" "Quantity" "Discount"
## [21] "Profit"
A plot of Sales vs Quantity can be produced:
# Encode Quantity as an ordered factor so the plot groups by each
# discrete quantity level in numeric order.
store$Quantity <- factor(store$Quantity, levels = 1:max(store$Quantity), ordered = TRUE)
# Quantity is already a factor after the line above, so the original
# factor(Quantity) wrapper inside aes() was redundant.
p <- ggplot(data = store, aes(x = Quantity, y = Profit, col = Quantity))
p + geom_boxplot() +
  # guides(col = FALSE) is deprecated since ggplot2 3.3.4; use "none".
  guides(col = "none") +
  labs(x = "Quantity Units", title = "Profit Given Quantity sold")
This suggests that 4/5 units of quantity produce the most variation in profit, for this reason it can be anticipated that higher quantities may be preferred because the profits will be more consistent.
# Re-list the column names before the next set of plots.
names(store)
## [1] "Row ID" "Order ID" "Order Date" "Ship Date"
## [5] "Ship Mode" "Customer ID" "Customer Name" "Segment"
## [9] "Country" "City" "State" "Postal Code"
## [13] "Region" "Product ID" "Category" "Sub-Category"
## [17] "Product Name" "Sales" "Quantity" "Discount"
## [21] "Profit"
# Pie chart of Profit by Region: a single stacked bar transformed to
# polar coordinates.
ggplot(store, aes(x = "", y = Profit, fill = Region)) +
  # geom_col() is the idiomatic shorthand for geom_bar(stat = "identity").
  geom_col(width = 1) +
  coord_polar(theta = "y", start = 0)
This appears to show that the South Region earns the most profits — but that is actually wrong, and I am not yet sure why; I will need to investigate how pie charts are handled in `ggplot2`, as I did not realise they would be so poorly supported.
# Dodged bars of Sales per Region, one bar per shipping mode.
ggplot(store, aes(x = Region, y = Sales, fill = `Ship Mode`)) +
  geom_col(position = "dodge")
This indicates that most sales are made in the South and Central Regions, and that the shipping method that produces the most sales is Standard Class.
# Stacked profit per state, filled by customer segment; x labels are
# rotated so every state name stays legible.
ggplot(store, aes(x = State, y = Profit, fill = Segment)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
This shows that New York and Arkansas were the states with the biggest impact on profits; generally, consumer purchases generate the most profits.
# Column chart of Quantity against Discount.
ggplot(store, aes(x = Discount, y = Quantity)) +
  geom_col(col = "royalblue")
This suggests that the largest discount is given for 2–3 items, whereas purchasing many items does not often involve a discount.
This is as far as I got; the issue is that I would need to convert the data to have lat/long coordinates or standardised names, which would take a bit of research, whereas Tableau can interpret US place names in a way that has not been implemented in `ggplot2`.
# Choropleth of 2015 state populations using the statepop data set
# bundled with usmap; fill is a continuous population scale with
# comma-formatted legend labels.
library(usmap)
library(ggplot2)
plot_usmap(data = statepop, values = "pop_2015", color = "red") +
scale_fill_continuous(name = "Population (2015)", label = scales::comma) +
theme(legend.position = "right")
## Warning: Use of `map_df$x` is discouraged. Use `x` instead.
## Warning: Use of `map_df$y` is discouraged. Use `y` instead.
## Warning: Use of `map_df$group` is discouraged. Use `group` instead.
Different colour palettes could have been used in these visualisations, for example:
# The Sales-by-Region plot again, recoloured with the ColorBrewer
# "Dark2" qualitative palette.
ggplot(store, aes(x = Region, y = Sales, fill = `Ship Mode`)) +
  scale_fill_brewer(palette = "Dark2") +
  geom_col(position = "dodge")
# A hand-picked four-colour palette, applied via scale_fill_manual().
pt <- c(
  "#AE60B8",
  "#ED2B78",
  "#B8E3B8",
  "#FA9F7B"
)
ggplot(store, aes(x = Region, y = Sales, fill = `Ship Mode`)) +
  scale_fill_manual(values = pt) +
  geom_col(position = "dodge")
# Revisit the housing scatter plot, this time with a custom
# green-to-dark-blue colour gradient for LSTAT.
ggplot(house, aes(x = NOX, y = MEDV, shape = factor(round(RM)), col = LSTAT)) +
  geom_point() +
  theme_bw() +
  scale_colour_gradient(low = "green", high = "darkblue") +
  labs(
    x = TeX("Nitrogen Oxides Concentration (10^{-1} ppm)"),
    y = "Median House Value",
    title = " Housing Prices"
  )