-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathGraphs-Advanced.R
More file actions
101 lines (79 loc) · 4.46 KB
/
Graphs-Advanced.R
File metadata and controls
101 lines (79 loc) · 4.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# "ggplot2" package
#If using for first time, uncomment this line to install "ggplot2" package
#install.packages("ggplot2")
library(ggplot2)
# Scatter plot of engine displacement ("displ") against city mileage ("cty").
# The "manufacturer" attribute is mapped to point colour to tell makes apart.
ggplot(data = mpg, mapping = aes(x = displ, y = cty)) +
  geom_point(mapping = aes(colour = manufacturer))
##############################################################################################
# Small multiples: facet_wrap() draws one panel per value of an attribute,
# here one scatter plot per vehicle "class".
ggplot(mpg, aes(displ, cty)) +
  geom_point() +
  facet_wrap(~class)

# Data visualization in different formats
# geom_smooth() overlays a smoothed conditional mean of y given x. The method
# is set to "loess" explicitly because `span` is a loess-only argument; this
# also suppresses the "using method = ..." message printed for the default.
# se = FALSE hides the shaded confidence band around the curve.
ggplot(mpg, aes(displ, cty)) +
  geom_smooth(method = "loess", formula = y ~ x, se = FALSE) +
  facet_wrap(~class)

# A box plot needs a discrete grouping. "displ" is continuous, so it is binned
# with cut_width(); without a group ggplot2 warns ("Continuous x aesthetic")
# and draws just one box per panel.
ggplot(mpg, aes(displ, cty)) +
  geom_boxplot(aes(group = cut_width(displ, 1))) +
  facet_wrap(~class)

# `span` is the proportion of nearest points used for each local loess fit
# (the loess default is 0.75). span = 1 uses every point for each fit and
# gives a smoother curve; a small span such as 0.2 uses only the nearest 20%
# of points and gives a wigglier curve that follows the data closely.
ggplot(mpg, aes(displ, cty)) +
  geom_smooth(method = "loess", formula = y ~ x, span = 1)
ggplot(mpg, aes(displ, cty)) +
  geom_smooth(method = "loess", formula = y ~ x, span = 0.2)
##############################################################################################
# Avoiding overplotting with geom_jitter() and geom_violin().
# Baseline: a plain scatter plot, where points sharing a position overlap.
ggplot(data = mpg, mapping = aes(x = drv, y = hwy)) +
  geom_point()
# geom_jitter() adds some random noise to each point's position; better
# suited to smaller datasets.
ggplot(data = mpg, mapping = aes(x = drv, y = hwy)) +
  geom_jitter()
# geom_violin() draws a density estimate of the distribution instead of points.
ggplot(data = mpg, mapping = aes(x = drv, y = hwy)) +
  geom_violin()
##############################################################################################
# A histogram and a frequency polygon each show one attribute in detail.
ggplot(data = mtcars, mapping = aes(x = cyl)) +
  geom_histogram(bins = 5)
ggplot(data = mtcars, mapping = aes(x = cyl)) +
  geom_freqpoly(bins = 5)
# The same two views of "displ", differentiated with respect to a second
# attribute ("drv"): by line colour, then by fill with one stacked panel each.
ggplot(data = mpg, mapping = aes(x = displ, colour = drv)) +
  geom_freqpoly(bins = 5)
ggplot(data = mpg, mapping = aes(x = displ, fill = drv)) +
  geom_histogram(bins = 5) +
  facet_wrap(~drv, ncol = 1)
##############################################################################################
# Bar charts
marks <- data.frame(
  stud = c("SK7", "CR7", "Messi"),
  mark = c(98, 90, 100)
)
print(marks)
# geom_col() makes the bar height the value in the data, whereas geom_bar()
# counts rows by default. It is the dedicated form of
# geom_bar(stat = "identity") and needs both an x and a y aesthetic.
ggplot(marks, aes(stud, mark)) +
  geom_col() +
  geom_point()
ggplot(marks, aes(stud, mark)) +
  geom_point()
# Time series
ggplot(economics, aes(date, uempmed)) +
  geom_line()
##############################################################################################
# Basic plot types using ggplot
sample_text <- data.frame(
  x = c(1, 2, 3),
  y = c(10, 15, 35),
  label = c("A", "B", "C")
)
print(sample_text)
# Base plot with axis labels but no layers; each statement below adds one geom.
sample_plot <- ggplot(sample_text, aes(x, y, label = label)) +
  labs(x = "ID", y = "Value")
print(sample_plot)
sample_plot + geom_point() + ggtitle("Point")
# Draws each row's "label" value as text at its (x, y) position
sample_plot + geom_text() + ggtitle("Text")
# geom_col() is the dedicated form of geom_bar(stat = "identity")
sample_plot + geom_col() + ggtitle("Bar chart")
sample_plot + geom_polygon() + ggtitle("Polygon")
sample_plot + geom_raster() + ggtitle("Raster")
##############################################################################################
# Use case - highlighting a specific portion of the data in the plot using conditions
# If using ggalt for the first time, uncomment the next line to install the package
#install.packages("ggalt")
library(ggalt)
# Keep only the midwest rows whose total population lies strictly between
# 350,000 and 500,000 and whose area lies strictly between 0.01 and 0.1.
midwest_select <- midwest[
  with(
    midwest,
    poptotal > 350000 & poptotal < 500000 & area > 0.01 & area < 0.1
  ),
]
print(midwest_select)
# Plot area against population, then outline the selected rows in red.
ggplot(data = midwest, mapping = aes(x = area, y = poptotal)) +
  geom_smooth() +
  # Point colour encodes "state"; point size encodes "popdensity"
  geom_point(mapping = aes(color = state, size = popdensity)) +
  xlim(c(0, 0.1)) +
  ylim(c(0, 500000)) +
  # Encircle the filtered subset computed above
  geom_encircle(
    mapping = aes(x = area, y = poptotal),
    data = midwest_select,
    color = "red",
    size = 4,
    expand = 0.1
  ) +
  labs(
    x = "Area",
    y = "Population",
    title = "Area vs Population in Midwest",
    caption = "Source: midwest dataset"
  )
##############################################################################################