Help Session Lesson 6
Let's grab some data.
library(tidyverse)
acount_smeta<-read_tsv("../data/countsANDmeta.txt")
acount_smeta
#raw count data
acount<-read_csv("../data/airway_rawcount.csv") %>%
dplyr::rename("Feature" = "...1")
acount
#differential expression results
dexp<-read_delim("../data/diffexp_results_edger_airways.txt")
dexp
All solutions should use the pipe.
-
Filter
acount
("./data/airway_rawcount.csv") to include genes NOT found in our differential expression results (dexp
).Solution}
acount %>% filter(!Feature %in% dexp$feature)
## # A tibble: 48,176 × 9 ## Feature SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 ## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 ENSG000000… 0 0 0 0 0 0 ## 2 ENSG000000… 0 0 2 0 1 0 ## 3 ENSG000000… 10 2 9 2 10 6 ## 4 ENSG000000… 3 0 3 1 4 0 ## 5 ENSG000000… 2 0 1 0 0 2 ## 6 ENSG000000… 0 0 1 0 0 0 ## 7 ENSG000000… 4 6 22 10 2 1 ## 8 ENSG000000… 0 0 0 1 1 2 ## 9 ENSG000000… 3 1 0 0 3 1 ## 10 ENSG000000… 0 0 0 0 0 0 ## # ℹ 48,166 more rows ## # ℹ 2 more variables: SRR1039520 <dbl>, SRR1039521 <dbl>
#OR acount %>% anti_join(dexp, by=c("Feature"="feature"))
## # A tibble: 48,176 × 9 ## Feature SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 ## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 ENSG000000… 0 0 0 0 0 0 ## 2 ENSG000000… 0 0 2 0 1 0 ## 3 ENSG000000… 10 2 9 2 10 6 ## 4 ENSG000000… 3 0 3 1 4 0 ## 5 ENSG000000… 2 0 1 0 0 2 ## 6 ENSG000000… 0 0 1 0 0 0 ## 7 ENSG000000… 4 6 22 10 2 1 ## 8 ENSG000000… 0 0 0 1 1 2 ## 9 ENSG000000… 3 1 0 0 3 1 ## 10 ENSG000000… 0 0 0 0 0 0 ## # ℹ 48,166 more rows ## # ℹ 2 more variables: SRR1039520 <dbl>, SRR1039521 <dbl>
-
From
acounts_smeta
("./data/countsANDmeta.txt"), retrieve the bottom five genes with the lowest mean raw counts bydex
. What are the dimensions of the resulting data frame? Why are there more than 5 rows?Solution}
acount_smeta %>% group_by(dex,Feature) %>% summarize(mean_counts=mean(Count)) %>% slice_min(n=5, order_by=mean_counts) %>% glimpse()
## `summarise()` has grouped output by 'dex'. You can override using the `.groups` ## argument.
## Rows: 67,285 ## Columns: 3 ## Groups: dex [2] ## $ dex <chr> "trt", "trt", "trt", "trt", "trt", "trt", "trt", "trt", "t… ## $ Feature <chr> "ENSG00000000005", "ENSG00000000938", "ENSG00000002726", "… ## $ mean_counts <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
-
Using mutate apply a base-10 logarithmic transformation to the numeric columns in
acount
; add a pseudocount of 1 prior to this transformation. Save the resulting data frame to an object called log10counts.Solution}
log10counts<- acount %>% mutate(across(where(is.numeric),~log10(.x+1))) log10counts
## # A tibble: 64,102 × 9 ## Feature SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 ## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 ENSG000000… 2.83 2.65 2.94 2.61 3.06 3.02 ## 2 ENSG000000… 0 0 0 0 0 0 ## 3 ENSG000000… 2.67 2.71 2.79 2.56 2.77 2.90 ## 4 ENSG000000… 2.42 2.33 2.42 2.22 2.39 2.52 ## 5 ENSG000000… 1.79 1.75 1.61 1.56 1.90 1.81 ## 6 ENSG000000… 0 0 0.477 0 0.301 0 ## 7 ENSG000000… 3.51 3.57 3.79 3.63 3.83 4.04 ## 8 ENSG000000… 3.16 3.03 3.24 2.95 3.15 3.16 ## 9 ENSG000000… 2.72 2.58 2.78 2.69 2.91 2.85 ## 10 ENSG000000… 2.60 2.37 2.67 2.25 2.82 2.77 ## # ℹ 64,092 more rows ## # ℹ 2 more variables: SRR1039520 <dbl>, SRR1039521 <dbl>
-
Create a column in
dexp
("./data/diffexp_results_edger_airways.txt") calledExpression
. This column should say "Down-regulated" iflogFC
is less than -1 or "Up-regulated" iflogFC
is greater than 1. All other values should say "None". Hint: Look up help forcase_when()
.Solution}
dexp_new<-dexp %>% mutate(Expression=case_when(logFC < -1 ~ "Down-regulated", logFC > -1 & logFC < 1 ~ "None", logFC > 1 ~ "Up-regulated"))
Challenge question:
-
Calculate the mean raw counts for each gene by treatment in
acount_smeta
. Combine these results with the differential expression results. Your resulting data frame should resemble the following:Solution}
a<-acount_smeta %>% group_by(dex, Feature) %>% summarise(mean_count = mean(Count)) %>% pivot_wider(names_from=dex,values_from=mean_count, names_prefix="Mean_Counts_") %>% right_join(dexp, by=c("Feature" = "feature"))
## `summarise()` has grouped output by 'dex'. You can override using the `.groups` ## argument.
## # A tibble: 15,926 × 12 ## Feature Mean_Counts_trt Mean_Counts_untrt albut transcript ref_genome ## <chr> <dbl> <dbl> <chr> <chr> <chr> ## 1 ENSG00000000003 619. 865 untrt TSPAN6 hg38 ## 2 ENSG00000000419 547. 523 untrt DPM1 hg38 ## 3 ENSG00000000457 234. 250. untrt SCYL3 hg38 ## 4 ENSG00000000460 53.2 63.5 untrt C1orf112 hg38 ## 5 ENSG00000000971 6738. 5331. untrt CFH hg38 ## 6 ENSG00000001036 1123. 1487. untrt FUCA2 hg38 ## 7 ENSG00000001084 573. 658. untrt GCLC hg38 ## 8 ENSG00000001167 316 469 untrt NFYA hg38 ## 9 ENSG00000001460 168. 208 untrt STPG1 hg38 ## 10 ENSG00000001461 2545 3113. untrt NIPAL3 hg38 ## # ℹ 15,916 more rows ## # ℹ 6 more variables: .abundant <lgl>, logFC <dbl>, logCPM <dbl>, F <dbl>, ## # PValue <dbl>, FDR <dbl>