Help Session Lesson 6

Let's grab some data.

library(tidyverse)
acount_smeta<-read_tsv("../data/countsANDmeta.txt")
acount_smeta

#raw count data
acount<-read_csv("../data/airway_rawcount.csv") %>%
  dplyr::rename("Feature" = "...1")
acount


#differential expression results
dexp<-read_delim("../data/diffexp_results_edger_airways.txt")
dexp

All solutions should use the pipe.

Filter acount ("./data/airway_rawcount.csv") to include genes NOT found in our differential expression results (dexp).

Solution}

acount %>% filter(!Feature %in% dexp$feature)

## # A tibble: 48,176 × 9
##    Feature     SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517
##    <chr>            <dbl>      <dbl>      <dbl>      <dbl>      <dbl>      <dbl>
##  1 ENSG000000…          0          0          0          0          0          0
##  2 ENSG000000…          0          0          2          0          1          0
##  3 ENSG000000…         10          2          9          2         10          6
##  4 ENSG000000…          3          0          3          1          4          0
##  5 ENSG000000…          2          0          1          0          0          2
##  6 ENSG000000…          0          0          1          0          0          0
##  7 ENSG000000…          4          6         22         10          2          1
##  8 ENSG000000…          0          0          0          1          1          2
##  9 ENSG000000…          3          1          0          0          3          1
## 10 ENSG000000…          0          0          0          0          0          0
## # ℹ 48,166 more rows
## # ℹ 2 more variables: SRR1039520 <dbl>, SRR1039521 <dbl>

#OR

acount %>% anti_join(dexp, by=c("Feature"="feature"))

## # A tibble: 48,176 × 9
##    Feature     SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517
##    <chr>            <dbl>      <dbl>      <dbl>      <dbl>      <dbl>      <dbl>
##  1 ENSG000000…          0          0          0          0          0          0
##  2 ENSG000000…          0          0          2          0          1          0
##  3 ENSG000000…         10          2          9          2         10          6
##  4 ENSG000000…          3          0          3          1          4          0
##  5 ENSG000000…          2          0          1          0          0          2
##  6 ENSG000000…          0          0          1          0          0          0
##  7 ENSG000000…          4          6         22         10          2          1
##  8 ENSG000000…          0          0          0          1          1          2
##  9 ENSG000000…          3          1          0          0          3          1
## 10 ENSG000000…          0          0          0          0          0          0
## # ℹ 48,166 more rows
## # ℹ 2 more variables: SRR1039520 <dbl>, SRR1039521 <dbl>

From acounts_smeta ("./data/countsANDmeta.txt"), retrieve the bottom five genes with the lowest mean raw counts by dex. What are the dimensions of the resulting data frame? Why are there more than 5 rows?

Solution}

acount_smeta %>% 
  group_by(dex,Feature) %>% 
  summarize(mean_counts=mean(Count))  %>% 
  slice_min(n=5, order_by=mean_counts) %>%
  glimpse()

## `summarise()` has grouped output by 'dex'. You can override using the `.groups`
## argument.

## Rows: 67,285
## Columns: 3
## Groups: dex [2]
## $ dex         <chr> "trt", "trt", "trt", "trt", "trt", "trt", "trt", "trt", "t…
## $ Feature     <chr> "ENSG00000000005", "ENSG00000000938", "ENSG00000002726", "…
## $ mean_counts <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

Using mutate apply a base-10 logarithmic transformation to the numeric columns in acount; add a pseudocount of 1 prior to this transformation. Save the resulting data frame to an object called log10counts.

Solution}

log10counts<- acount %>% mutate(across(where(is.numeric),~log10(.x+1)))
log10counts

## # A tibble: 64,102 × 9
##    Feature     SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517
##    <chr>            <dbl>      <dbl>      <dbl>      <dbl>      <dbl>      <dbl>
##  1 ENSG000000…       2.83       2.65      2.94        2.61      3.06        3.02
##  2 ENSG000000…       0          0         0           0         0           0   
##  3 ENSG000000…       2.67       2.71      2.79        2.56      2.77        2.90
##  4 ENSG000000…       2.42       2.33      2.42        2.22      2.39        2.52
##  5 ENSG000000…       1.79       1.75      1.61        1.56      1.90        1.81
##  6 ENSG000000…       0          0         0.477       0         0.301       0   
##  7 ENSG000000…       3.51       3.57      3.79        3.63      3.83        4.04
##  8 ENSG000000…       3.16       3.03      3.24        2.95      3.15        3.16
##  9 ENSG000000…       2.72       2.58      2.78        2.69      2.91        2.85
## 10 ENSG000000…       2.60       2.37      2.67        2.25      2.82        2.77
## # ℹ 64,092 more rows
## # ℹ 2 more variables: SRR1039520 <dbl>, SRR1039521 <dbl>

Create a column in dexp ("./data/diffexp_results_edger_airways.txt") called Expression. This column should say "Down-regulated" if logFC is less than -1 or "Up-regulated" if logFC is greater than 1. All other values should say "None". Hint: Look up help for case_when().

Solution}

dexp_new<-dexp %>% mutate(Expression=case_when(logFC < -1 ~ "Down-regulated",
                                        logFC > -1 & logFC < 1 ~ "None",
                                        logFC > 1 ~ "Up-regulated"))

Challenge question:

Calculate the mean raw counts for each gene by treatment in acount_smeta. Combine these results with the differential expression results. Your resulting data frame should resemble the following:

Solution}

a<-acount_smeta %>% 
  group_by(dex, Feature) %>%
  summarise(mean_count = mean(Count)) %>% 
  pivot_wider(names_from=dex,values_from=mean_count,
              names_prefix="Mean_Counts_") %>%
  right_join(dexp, by=c("Feature" = "feature"))

## `summarise()` has grouped output by 'dex'. You can override using the `.groups`
## argument.

## # A tibble: 15,926 × 12
##    Feature         Mean_Counts_trt Mean_Counts_untrt albut transcript ref_genome
##    <chr>                     <dbl>             <dbl> <chr> <chr>      <chr>     
##  1 ENSG00000000003           619.              865   untrt TSPAN6     hg38      
##  2 ENSG00000000419           547.              523   untrt DPM1       hg38      
##  3 ENSG00000000457           234.              250.  untrt SCYL3      hg38      
##  4 ENSG00000000460            53.2              63.5 untrt C1orf112   hg38      
##  5 ENSG00000000971          6738.             5331.  untrt CFH        hg38      
##  6 ENSG00000001036          1123.             1487.  untrt FUCA2      hg38      
##  7 ENSG00000001084           573.              658.  untrt GCLC       hg38      
##  8 ENSG00000001167           316               469   untrt NFYA       hg38      
##  9 ENSG00000001460           168.              208   untrt STPG1      hg38      
## 10 ENSG00000001461          2545              3113.  untrt NIPAL3     hg38      
## # ℹ 15,916 more rows
## # ℹ 6 more variables: .abundant <lgl>, logFC <dbl>, logCPM <dbl>, F <dbl>,
## #   PValue <dbl>, FDR <dbl>