x1 <- c("Dec", "Apr", "Jan", "Mar")
Two issues:
sort(x1)
#> [1] "Apr" "Dec" "Jan" "Mar"
month_levels <- c( "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
y1 <- factor(x1, levels = month_levels)
y1
#> [1] Dec Apr Jan Mar
#> Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
sort(y1)
#> [1] Jan Mar Apr Dec
#> Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
NA
:
x2 <- c("Dec", "Apr", "Jam", "Mar")
(y2 <- factor(x2, levels = month_levels))
#> [1] Dec Apr <NA> Mar
#> Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
forcats::fct()
y2 <- fct(x2, levels = month_levels)
#> Error in `fct()`:
#> ! All values of `x` must appear in `levels` or `na`
#> ℹ Missing level: "Jam"
factor()
makes factors with levels in alphabetical order:
factor(x1)
#> [1] Dec Apr Jan Mar
#> Levels: Apr Dec Jan Mar
forcats::fct()
orders levels by first appearance in the vector:
fct(x1)
#> [1] Dec Apr Jan Mar
#> Levels: Dec Apr Jan Mar
levels()
:levels(y2)#> [1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec"
forcats::gss_cat
, loaded in tidyverse)gss_cat#> # A tibble: 21,483 × 9#> year marital age race rincome partyid #> <int> <fct> <int> <fct> <fct> <fct> #> 1 2000 Never married 26 White $8000 to 9999 Ind,near rep #> 2 2000 Divorced 48 White $8000 to 9999 Not str republican#> 3 2000 Widowed 67 White Not applicable Independent #> 4 2000 Never married 39 White Not applicable Ind,near rep #> 5 2000 Divorced 25 White Not applicable Not str democrat #> 6 2000 Married 25 White $20000 - 24999 Strong democrat #> # ℹ 21,477 more rows#> # ℹ 3 more variables: relig <fct>, denom <fct>, tvhours <int>
count
:gss_cat %>% count(race)
#> # A tibble: 3 × 2#> race n#> <fct> <int>#> 1 Other 1959#> 2 Black 3129#> 3 White 16395
rincome
(reported income)
ggplot(gss_cat, aes(x = rincome)) + geom_bar()
rincome
(reported income)
ggplot(gss_cat, aes(x = rincome)) + geom_bar()
ggplot(gss_cat, aes(y = rincome)) + geom_bar()
ggplot(gss_cat, aes(x = rincome)) + geom_bar()
ggplot(gss_cat, aes(x = rincome)) + geom_bar()
ggplot(gss_cat, aes(y = rincome)) + geom_bar()
gss_cat %>% count(relig) %>% arrange(n) %>% print(n=Inf)#> # A tibble: 15 × 2#> relig n#> <fct> <int>#> 1 Don't know 15#> 2 Native american 23#> 3 Other eastern 32#> 4 Hinduism 71#> 5 No answer 93#> 6 Orthodox-christian 95#> 7 Moslem/islam 104#> 8 Inter-nondenominational 109#> 9 Buddhism 147#> 10 Other 224#> 11 Jewish 388#> 12 Christian 689#> 13 None 3523#> 14 Catholic 5124#> 15 Protestant 10846
gss_cat %>% count(partyid) %>% arrange(n) %>% print(n=Inf)#> # A tibble: 10 × 2#> partyid n#> <fct> <int>#> 1 Don't know 1#> 2 No answer 154#> 3 Other party 393#> 4 Ind,near rep 1791#> 5 Strong republican 2314#> 6 Ind,near dem 2499#> 7 Not str republican 3032#> 8 Strong democrat 3490#> 9 Not str democrat 3690#> 10 Independent 4119
geom_freqpoly()
with color given by marital status.
ggplot(gss_cat, aes(x = tvhours, y = after_stat(density), color = marital)) + geom_freqpoly()
geom_freqpoly()
with color given by marital status.
ggplot(gss_cat, aes(x = tvhours, y = after_stat(density), color = marital)) + geom_freqpoly()
If we try the same thing for religion, it is trickier to visualize:
Let's look at TV hours watched per religion. Maybe we can first try doing a geom_freqpoly()
with color given by religion.
ggplot(gss_cat, aes(x = tvhours, y = after_stat(density), color = relig)) + geom_freqpoly()
relig_summary <- gss_cat |> group_by(relig) |> summarize( tvhours = mean(tvhours, na.rm = TRUE), n = n() )
ggplot(relig_summary, aes(x = tvhours, y = relig)) + geom_point()
fct_reorder(.f = , .x = , .fun = )
.f
: factor whose levels to modify
.x
: numeric vector to give the new order of levels
.fun
: function to use if there are multiple values of .x
for a given value of .f
(default: median
)
ggplot(relig_summary, aes(x = tvhours, y = fct_reorder(relig, tvhours))) + geom_point()
gss_cat %>% count(partyid)#> # A tibble: 10 × 2#> partyid n#> <fct> <int>#> 1 No answer 154#> 2 Don't know 1#> 3 Other party 393#> 4 Strong republican 2314#> 5 Not str republican 3032#> 6 Ind,near rep 1791#> # ℹ 4 more rows
fct_recode()
gss_cat %>% mutate( partyid = fct_recode(partyid, "Republican, strong" = "Strong republican", "Republican, weak" = "Not str republican", "Independent, near rep" = "Ind,near rep", "Independent, near dem" = "Ind,near dem", "Democrat, weak" = "Not str democrat", "Democrat, strong" = "Strong democrat")) %>% count(partyid)#> # A tibble: 10 × 2#> partyid n#> <fct> <int>#> 1 No answer 154#> 2 Don't know 1#> 3 Other party 393#> 4 Republican, strong 2314#> 5 Republican, weak 3032#> 6 Independent, near rep 1791#> # ℹ 4 more rows
gss_cat |> mutate( partyid = fct_recode(partyid, "Republican, strong" = "Strong republican", "Republican, weak" = "Not str republican", "Independent, near rep" = "Ind,near rep", "Independent, near dem" = "Ind,near dem", "Democrat, weak" = "Not str democrat", "Democrat, strong" = "Strong democrat", "Other" = "No answer", "Other" = "Don't know", "Other" = "Other party" ) )
fct_collapse
:gss_cat |> mutate( partyid = fct_collapse(partyid, "other" = c("No answer", "Don't know", "Other party"), "rep" = c("Strong republican", "Not str republican"), "ind" = c("Ind,near rep", "Independent", "Ind,near dem"), "dem" = c("Not str democrat", "Strong democrat") ) ) |> count(partyid)#> # A tibble: 4 × 2#> partyid n#> <fct> <int>#> 1 other 548#> 2 rep 5346#> 3 ind 8409#> 4 dem 7180
weather
tibble in nycflights13:library(nycflights13)weather#> # A tibble: 26,115 × 15#> origin year month day hour temp dewp humid wind_dir wind_speed#> <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>#> 1 EWR 2013 1 1 1 39.0 26.1 59.4 270 10.4 #> 2 EWR 2013 1 1 2 39.0 27.0 61.6 250 8.06#> 3 EWR 2013 1 1 3 39.0 28.0 64.4 240 11.5 #> 4 EWR 2013 1 1 4 39.9 28.0 62.2 250 12.7 #> 5 EWR 2013 1 1 5 39.0 28.0 64.4 260 12.7 #> 6 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5 #> # ℹ 26,109 more rows#> # ℹ 5 more variables: wind_gust <dbl>, precip <dbl>, pressure <dbl>, …
weather2 <- weather %>% group_by(year, month, origin) %>% summarize(min_temp = min(temp, na.rm=TRUE), max_temp = max(temp, na.rm=TRUE), avg_temp = mean(temp, na.rm=TRUE)) %>% group_by(month, origin) %>% summarize(min_temp = mean(min_temp, na.rm=TRUE), max_temp = mean(max_temp, na.rm=TRUE), avg_temp = mean(avg_temp, na.rm=TRUE), n = n())
weather2 %>% filter(origin == 'EWR') %>% ggplot(aes(x = month)) + geom_line(aes(y = min_temp), color = 'blue') + geom_line(aes(y = max_temp), color = 'red') + geom_line(aes(y = avg_temp), color = 'black')
(weather3 <- weather2 %>% pivot_longer(cols = c(min_temp, max_temp, avg_temp), values_to = "temperature", names_to = "measurement"))#> # A tibble: 108 × 5#> # Groups: month [12]#> month origin n measurement temperature#> <int> <chr> <int> <chr> <dbl>#> 1 1 EWR 1 min_temp 10.9#> 2 1 EWR 1 max_temp 64.4#> 3 1 EWR 1 avg_temp 35.6#> 4 1 JFK 1 min_temp 12.0#> 5 1 JFK 1 max_temp 57.9#> 6 1 JFK 1 avg_temp 35.4#> # ℹ 102 more rows
ggplot(weather3 %>% filter(origin == 'EWR'), aes(x = month, y = temperature, color = measurement)) + geom_line()
ggplot(weather3 %>% filter(origin == 'EWR'), aes(x = month, y = temperature, color = measurement)) + geom_line(linewidth = 2) + scale_color_manual( values = c(min_temp = "blue", avg_temp = "black", max_temp = "red"), labels = c("min_temp" = "minimum", "avg_temp" = "average", "max_temp" = "maximum")) + scale_x_continuous(breaks = 1:12, minor_breaks = 1:12) + labs(title = "Temperature at EWR per month")
ggplot(weather3, aes(x = month, y = temperature, color = measurement)) + geom_line(linewidth = 2) + scale_color_manual( values = c(min_temp = "blue", avg_temp = "black", max_temp = "red"), labels = c("min_temp" = "minimum", "avg_temp" = "average", "max_temp" = "maximum")) + scale_x_continuous(breaks = 1:12, minor_breaks = 1:12) + facet_grid(. ~ origin)
weather %>% filter(month == 5, temp < 25)#> # A tibble: 1 × 15#> origin year month day hour temp dewp humid wind_dir wind_speed#> <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>#> 1 JFK 2013 5 8 22 13.1 12.0 95.3 80 8.06#> # ℹ 5 more variables: wind_gust <dbl>, precip <dbl>, pressure <dbl>,#> # visib <dbl>, time_hour <dttm>
weather %>% filter(month == 5, day == 8) %>% filter(between(hour, 21, 23)) %>% print(n = Inf)#> # A tibble: 9 × 15#> origin year month day hour temp dewp humid wind_dir wind_speed#> <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>#> 1 EWR 2013 5 8 21 59 53.1 80.6 80 4.60#> 2 EWR 2013 5 8 22 59 52.0 77.5 90 3.45#> 3 EWR 2013 5 8 23 57.9 52.0 80.6 0 0 #> 4 JFK 2013 5 8 21 57.0 48.9 74.3 170 11.5 #> 5 JFK 2013 5 8 22 13.1 12.0 95.3 80 8.06#> 6 JFK 2013 5 8 23 57.2 53.6 87.7 120 4.60#> 7 LGA 2013 5 8 21 59 48.9 69.2 NA 5.75#> 8 LGA 2013 5 8 22 59 51.1 75.0 100 6.90#> 9 LGA 2013 5 8 23 55.9 51.1 83.7 90 6.90#> # ℹ 5 more variables: wind_gust <dbl>, precip <dbl>, pressure <dbl>,#> # visib <dbl>, time_hour <dttm>
min
, max
, mean
, pmin
, pmax
, group_by()
, summarize()
, pivot_wider
, pivot_longer
, regex, joins, etc. Understand how NAs work, what the typical default behavior for NAs is, etc.
x1 <- c("Dec", "Apr", "Jan", "Mar")
Two issues:
sort(x1)
#> [1] "Apr" "Dec" "Jan" "Mar"
month_levels <- c( "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
y1 <- factor(x1, levels = month_levels)
y1
#> [1] Dec Apr Jan Mar
#> Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
sort(y1)
#> [1] Jan Mar Apr Dec
#> Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
Keyboard shortcuts
↑, ←, Pg Up, k | Go to previous slide |
↓, →, Pg Dn, Space, j | Go to next slide |
Home | Go to first slide |
End | Go to last slide |
Number + Return | Go to specific slide |
b / m / f | Toggle blackout / mirrored / fullscreen mode |
c | Clone slideshow |
p | Toggle presenter mode |
t | Restart the presentation timer |
?, h | Toggle this help |
Esc | Back to slideshow |