Help Session Lesson 5
All solutions should use the pipe.
-
Import the file "./data/filtlowabund_scaledcounts_airways.txt" and save it to an object named `sc`. Create a subset data frame from `sc` that includes only the columns `sample`, `cell`, `dex`, `transcript`, and `counts_scaled`, and only the rows with the treatment "untrt" and the transcripts "ACTN1" and "ANAPC4".
Solution
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ── ## ✔ dplyr 1.1.3 ✔ readr 2.1.4 ## ✔ forcats 1.0.0 ✔ stringr 1.5.0 ## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1 ## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0 ## ✔ purrr 1.0.2 ## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ── ## ✖ dplyr::filter() masks stats::filter() ## ✖ dplyr::lag() masks stats::lag() ## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Import the scaled-counts table. No delimiter is passed, so read_delim()
# detects it from the file contents.
sc <- read_delim(file = "../data/filtlowabund_scaledcounts_airways.txt")
## Rows: 127408 Columns: 18 ## ── Column specification ──────────────────────────────────────────────────────── ## Delimiter: "\t" ## chr (11): feature, SampleName, cell, dex, albut, Run, Experiment, Sample, Bi... ## dbl (6): sample, counts, avgLength, TMM, multiplier, counts_scaled ## lgl (1): .abundant ## ## ℹ Use `spec()` to retrieve the full column specification for this data. ## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Columns to keep in the subset.
cnames <- c("sample", "cell", "dex", "transcript", "counts_scaled")

# Keep only those columns, then only untreated ("untrt") rows for the
# transcripts ACTN1 and ANAPC4.
# NOTE(review): the result is assigned back to `sc`, so every later step that
# uses `sc` sees only this subset, not the full imported table.
sc <- sc %>%
  select(all_of(cnames)) %>%
  filter(dex == "untrt", transcript %in% c("ACTN1", "ANAPC4"))
-
Using `dexp` ("./data/diffexp_results_edger_airways.txt"), create a data frame containing the top 5 differentially expressed genes and save it to an object named `top5`. Top genes in this case will have the smallest FDR-corrected p-value and an absolute value of the log fold change greater than 2. See `dplyr::slice()`.
Solution
# Import the edgeR differential-expression results. No delimiter is passed,
# so read_delim() detects it from the file contents.
dexp <- read_delim(file = "../data/diffexp_results_edger_airways.txt")
## Rows: 15926 Columns: 10 ## ── Column specification ──────────────────────────────────────────────────────── ## Delimiter: "\t" ## chr (4): feature, albut, transcript, ref_genome ## dbl (5): logFC, logCPM, F, PValue, FDR ## lgl (1): .abundant ## ## ℹ Use `spec()` to retrieve the full column specification for this data. ## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Top 5 genes: require |logFC| > 2, sort by ascending FDR, and keep the
# first five rows.
top5 <- dexp %>%
  dplyr::filter(abs(logFC) > 2) %>%
  dplyr::arrange(FDR) %>%
  dplyr::slice_head(n = 5)
-
Filter `sc` to contain only the top 5 differentially expressed genes.
Solution
# Keep the rows of `sc` whose transcript is one of the genes in `top5`.
# NOTE(review): the rendered result is empty -- presumably because `sc` was
# overwritten earlier with an ACTN1/ANAPC4-only subset; confirm whether the
# full imported table was intended here.
sc %>%
  dplyr::filter(transcript %in% top5[["transcript"]])
## # A tibble: 0 × 5 ## # ℹ 5 variables: sample <dbl>, cell <chr>, dex <chr>, transcript <chr>, ## # counts_scaled <dbl>
-
Select only columns of type character from `sc`.
Solution
# Retain only the columns of `sc` for which is.character() is TRUE.
sc %>%
  dplyr::select(where(is.character))
## # A tibble: 8 × 3 ## cell dex transcript ## <chr> <chr> <chr> ## 1 N61311 untrt ANAPC4 ## 2 N61311 untrt ACTN1 ## 3 N052611 untrt ANAPC4 ## 4 N052611 untrt ACTN1 ## 5 N080611 untrt ANAPC4 ## 6 N080611 untrt ACTN1 ## 7 N061011 untrt ANAPC4 ## 8 N061011 untrt ACTN1
-
Select all columns from `dexp` except `.abundant` and `PValue`. Keep only rows with `FDR` less than or equal to 0.01.
Solution
# Drop the `.abundant` and `PValue` columns, then keep rows with FDR <= 0.01.
dexp %>%
  select(!c(.abundant, PValue)) %>%
  filter(FDR <= 0.01)
## # A tibble: 2,763 × 8 ## feature albut transcript ref_genome logFC logCPM F FDR ## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> ## 1 ENSG00000000003 untrt TSPAN6 hg38 -0.390 5.06 32.8 0.00283 ## 2 ENSG00000000971 untrt CFH hg38 0.417 8.09 29.3 0.00376 ## 3 ENSG00000001167 untrt NFYA hg38 -0.509 4.13 44.9 0.00126 ## 4 ENSG00000002834 untrt LASP1 hg38 0.388 8.39 22.7 0.00722 ## 5 ENSG00000003096 untrt KLHL13 hg38 -0.949 4.16 84.8 0.000234 ## 6 ENSG00000003402 untrt CFLAR hg38 1.18 6.90 130. 0.0000800 ## 7 ENSG00000003987 untrt MTMR7 hg38 0.993 0.341 24.7 0.00585 ## 8 ENSG00000004059 untrt ARF5 hg38 0.358 5.84 30.9 0.00328 ## 9 ENSG00000004487 untrt KDM1A hg38 -0.308 5.86 23.5 0.00663 ## 10 ENSG00000004700 untrt RECQL hg38 0.360 5.60 22.7 0.00721 ## # ℹ 2,753 more rows
-
Import the file "./data/airway_rawcount.csv". Use the function `rename()` to rename the first column. Use the pipe from `magrittr` to import and rename successively without intermediate steps or function nesting. Save to an object named `acount`.
Solution
# Import the raw counts and rename the first column in a single pipeline.
# The first column has no header in the file, so read_csv() names it `...1`.
acount <- "../data/airway_rawcount.csv" %>%
  read_csv() %>%
  dplyr::rename(Feature = "...1")
## New names: ## Rows: 64102 Columns: 9 ## ── Column specification ## ──────────────────────────────────────────────────────── Delimiter: "," chr ## (1): ...1 dbl (8): SRR1039508, SRR1039509, SRR1039512, SRR1039513, SRR1039516, ## SRR1039... ## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ ## Specify the column types or set `show_col_types = FALSE` to quiet this message. ## • `` -> `...1`
-
Use `filter()` on the object `acount` to keep only genes that had a count greater than 10 in at least one sample.
Solution
# Keep a gene (row) when at least one numeric column has a count above 10.
bcount <- acount %>%
  filter(if_any(where(is.numeric), function(x) x > 10))
-
Challenge Question: Filter out genes from `acount` that had a total count less than ten across all samples. Hint: Use `column_to_rownames()` and look up `rowSums()`.
Solution
# Remove genes whose total count across all samples is less than ten.
# Moving `Feature` into the row names leaves only the numeric count columns,
# which is what rowSums() needs. Inside the magrittr pipe, `.` is the data
# frame being filtered.
# Genes with a total below ten are dropped, so a total of exactly 10 is kept
# (>=, not >).
f_acount <- acount %>%
  column_to_rownames("Feature") %>%
  filter(rowSums(.) >= 10)