Help Session Lesson 3
Loading data
-
Import data from the sheet "iris_data_long" from the excel workbook (file_path = "./data/iris_data.xlsx"). Make sure the column names are unique and do not contain spaces. Save the imported data to an object called
iris_long
.Solution}
iris_long<-readxl::read_excel("../data/iris_data.xlsx",sheet="iris_data_long",.name_repair="universal",skip=3) iris_long
## # A tibble: 600 × 4 ## Iris.ID Species Measurement.location Measurement ## <dbl> <chr> <chr> <dbl> ## 1 1 setosa Sepal.Length 5.1 ## 2 1 setosa Sepal.Width 3.5 ## 3 1 setosa Petal.Length 1.4 ## 4 1 setosa Petal.Width 0.2 ## 5 2 setosa Sepal.Length 4.9 ## 6 2 setosa Sepal.Width 3 ## 7 2 setosa Petal.Length 1.4 ## 8 2 setosa Petal.Width 0.2 ## 9 3 setosa Sepal.Length 4.7 ## 10 3 setosa Sepal.Width 3.2 ## # ℹ 590 more rows
-
Import a tab delimited file (file_path= "./data/species_datacarpentry.txt"). Save the file to an object named
species
.genus
,species
, andtaxa
should be converted to factors upon import.Solution}
species<-readr::read_delim("../data/species_datacarpentry.txt",col_types="cfff") species
## # A tibble: 54 × 4 ## species_id genus species taxa ## <chr> <fct> <fct> <fct> ## 1 AB Amphispiza bilineata Bird ## 2 AH Ammospermophilus harrisi Rodent ## 3 AS Ammodramus savannarum Bird ## 4 BA Baiomys taylori Rodent ## 5 CB Campylorhynchus brunneicapillus Bird ## 6 CM Calamospiza melanocorys Bird ## 7 CQ Callipepla squamata Bird ## 8 CS Crotalus scutalatus Reptile ## 9 CT Cnemidophorus tigris Reptile ## 10 CU Cnemidophorus uniparens Reptile ## # ℹ 44 more rows
-
Load in a comma separated file with row names present (file_path= "./data/countB.csv") and save to an object named
countB
.Solution}
countB<-read.csv("../data/countB.csv",row.names=1) head(countB)
## SampleA_1 SampleA_2 SampleA_3 SampleB_1 SampleB_2 SampleB_3 ## Tspan6 703 567 867 71 970 242 ## TNMD 490 482 18 342 935 469 ## DPM1 921 797 622 661 8 500 ## SCYL3 335 216 222 774 979 793 ## FGR 574 574 515 584 941 344 ## CFH 577 792 672 104 192 936
Challenge data load
-
Load in a tab delimited file (file_path= "./data/WebexSession_report.txt") using
read_delim()
. You will need to troubleshoot the error message and modify the function arguments as needed.Solution}
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ── ## ✔ dplyr 1.1.3 ✔ readr 2.1.4 ## ✔ forcats 1.0.0 ✔ stringr 1.5.0 ## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1 ## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0 ## ✔ purrr 1.0.2 ## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ── ## ✖ dplyr::filter() masks stats::filter() ## ✖ dplyr::lag() masks stats::lag() ## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
read_delim("../data/WebexSession_report.txt",delim="\t",locale = locale(encoding = 'UTF-16'),skip=2) #via readr
## Rows: 10 Columns: 21 ## ── Column specification ──────────────────────────────────────────────────────── ## Delimiter: "\t" ## chr (7): Name, Date, Invited, Registered, Duration, Network joined from:, ... ## dbl (1): Participant ## lgl (11): Audio Type, Email, Company, Title, Phone Number, Address 1, Addre... ## time (2): Start time, End time ## ## ℹ Use `spec()` to retrieve the full column specification for this data. ## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 10 × 21 ## Participant `Audio Type` Name Email Date Invited Registered `Start time` ## <dbl> <lgl> <chr> <lgl> <chr> <chr> <chr> <time> ## 1 1 NA Partici… NA 6/8/… No N/A 13:00 ## 2 2 NA Partici… NA 6/9/… <NA> <NA> 13:00 ## 3 3 NA Partici… NA 6/10… No N/A 12:57 ## 4 4 NA Partici… NA 6/11… <NA> <NA> 12:57 ## 5 5 NA Partici… NA 6/12… No N/A 12:55 ## 6 6 NA Partici… NA 6/13… <NA> <NA> 12:55 ## 7 7 NA Partici… NA 6/14… No N/A 12:32 ## 8 8 NA Partici… NA 6/15… <NA> <NA> 12:32 ## 9 9 NA Partici… NA 6/16… Yes N/A 12:42 ## 10 NA NA <NA> NA <NA> <NA> <NA> NA ## # ℹ 13 more variables: `End time` <time>, Duration <chr>, Company <lgl>, ## # Title <lgl>, `Phone Number` <lgl>, `Address 1` <lgl>, `Address 2` <lgl>, ## # City <lgl>, `State/Province` <lgl>, `Zip/Postal Code` <lgl>, ## # `Country/region` <lgl>, `Network joined from:` <chr>, ## # `Internal Participant:` <chr>
head(read.delim("../data/WebexSession_report.txt", fileEncoding="UTF-16LE")) #via base R
## All.sessions.in.Eastern.Daylight.Time..New.York..GMT.04.00. X ## 1 Session detail for 'A Webex Meeting of some type': ## 2 Participant Audio Type ## 3 1 ## 4 2 ## 5 3 ## 6 4 ## X.1 X.2 X.3 X.4 X.5 X.6 X.7 X.8 ## 1 ## 2 Name Email Date Invited Registered Start time End time Duration ## 3 Participant 1 <NA> 6/8/22 No N/A 1:00 PM 1:59 PM 59 mins ## 4 Participant 2 <NA> 6/9/22 1:00 PM 1:59 PM 59 mins ## 5 Participant 3 <NA> 6/10/22 No N/A 12:57 PM 2:06 PM 69 mins ## 6 Participant 4 <NA> 6/11/22 12:57 PM 2:06 PM 69 mins ## X.9 X.10 X.11 X.12 X.13 X.14 X.15 ## 1 ## 2 Company Title Phone Number Address 1 Address 2 City State/Province ## 3 ## 4 ## 5 ## 6 ## X.16 X.17 X.18 X.19 ## 1 ## 2 Zip/Postal Code Country/region Network joined from: Internal Participant: ## 3 External Yes ## 4 ## 5 External Yes ## 6
Reshaping data
-
Reshape
iris_long
to a wide format. Your new column names will contain names fromMeasurement.location
. Your wide data should look as follows:## # A tibble: 150 × 6 ## Iris.ID Species Sepal.Length Sepal.Width Petal.Length Petal.Width ## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> ## 1 1 setosa 5.1 3.5 1.4 0.2 ## 2 2 setosa 4.9 3 1.4 0.2 ## 3 3 setosa 4.7 3.2 1.3 0.2 ## 4 4 setosa 4.6 3.1 1.5 0.2 ## 5 5 setosa 5 3.6 1.4 0.2 ## 6 6 setosa 5.4 3.9 1.7 0.4 ## 7 7 setosa 4.6 3.4 1.4 0.3 ## 8 8 setosa 5 3.4 1.5 0.2 ## 9 9 setosa 4.4 2.9 1.4 0.2 ## 10 10 setosa 4.9 3.1 1.5 0.1 ## # ℹ 140 more rows
Solution}
tidyr::pivot_wider(iris_long, names_from = Measurement.location, values_from = Measurement)
-
Let's use
table4a
from thetidyr
package. Usepivot_longer()
to place the year columns in a column namedyear
and their values in a column namedcases
.library(tidyr) data(table4a) table4a
## # A tibble: 3 × 3 ## country `1999` `2000` ## <chr> <dbl> <dbl> ## 1 Afghanistan 745 2666 ## 2 Brazil 37737 80488 ## 3 China 212258 213766
Solution}
pivot_longer(table4a,2:3, names_to = "year", values_to = "cases")
## # A tibble: 6 × 3 ## country year cases ## <chr> <chr> <dbl> ## 1 Afghanistan 1999 745 ## 2 Afghanistan 2000 2666 ## 3 Brazil 1999 37737 ## 4 Brazil 2000 80488 ## 5 China 1999 212258 ## 6 China 2000 213766
-
Separate the column
rate
from tidyr's table3 into two columns:cases
andpopulation
.data(table3) table3
## # A tibble: 6 × 3 ## country year rate ## <chr> <dbl> <chr> ## 1 Afghanistan 1999 745/19987071 ## 2 Afghanistan 2000 2666/20595360 ## 3 Brazil 1999 37737/172006362 ## 4 Brazil 2000 80488/174504898 ## 5 China 1999 212258/1272915272 ## 6 China 2000 213766/1280428583
Solution}
separate(table3, rate, into = c("cases", "population"))
## # A tibble: 6 × 4 ## country year cases population ## <chr> <dbl> <chr> <chr> ## 1 Afghanistan 1999 745 19987071 ## 2 Afghanistan 2000 2666 20595360 ## 3 Brazil 1999 37737 172006362 ## 4 Brazil 2000 80488 174504898 ## 5 China 1999 212258 1272915272 ## 6 China 2000 213766 1280428583
Reshape challenge
-
Use
pivot_longer
to reshape countB. Your reshaped data should look the same as the data below.Solution}
library(tidyverse) countB<-read.csv("../data/countB.csv",row.names=1) %>% rownames_to_column("Feature") countB_l<-pivot_longer(countB, cols=2:length(countB), names_to = c(".value", "Replicate"), names_sep = "_") #reshaping data so that all replicates are stacked in a single column by treatment tibble(countB_l)
## # A tibble: 27 × 4 ## Feature Replicate SampleA SampleB ## <chr> <chr> <int> <int> ## 1 Tspan6 1 703 71 ## 2 Tspan6 2 567 970 ## 3 Tspan6 3 867 242 ## 4 TNMD 1 490 342 ## 5 TNMD 2 482 935 ## 6 TNMD 3 18 469 ## 7 DPM1 1 921 661 ## 8 DPM1 2 797 8 ## 9 DPM1 3 622 500 ## 10 SCYL3 1 335 774 ## # ℹ 17 more rows