Trumpton

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.2     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Import the Trumpton table; no delimiter is given, so read_delim() infers it.
trumpton <- read_delim(file = "trumpton.txt")
## Rows: 7 Columns: 5
## -- Column specification --------------------------------------------------------
## Delimiter: "\t"
## chr (2): LastName, FirstName
## dbl (3): Age, Weight, Height
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the imported data to check that it was read as expected.
trumpton

Scatterplot

# Scatterplot of Weight against Age, one point per row of trumpton.
ggplot(trumpton, aes(x = Age, y = Weight)) +
  geom_point()

Generally, the older people in the trumpton dataset are heavier.

Filter and select

Find the person who weighs more than 100kg.

# Keep rows with Weight above 100, then show only the two name columns.
filter(trumpton, Weight > 100) %>%
  select(FirstName, LastName)

Barplot

# Bar chart of Age for each LastName, magenta bars with a black outline.
ggplot(trumpton, aes(x = LastName, y = Age)) +
  geom_col(fill = "magenta2", colour = "black")

Child Variants

# Import the child variant calls; no delimiter is given, so read_delim()
# infers it from the file.
child <- read_delim(file = "Child_Variants.csv")
## Rows: 25822 Columns: 11
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (6): CHR, dbSNP, REF, ALT, GENE, ENST
## dbl (5): POS, QUAL, MutantReads, COVERAGE, MutantReadPercent
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the imported variant table to check that it was read as expected.
child

Filter

Select all of the rows (variants) which occur in the first 5Mbp of Chr X.

# Keep only the rows on chromosome X whose position is at most 5,000,000.
# Both conditions must hold for a row to be retained.
x_filtered <- filter(child, CHR == "X", POS <= 5000000)

x_filtered

Scatterplot

# Scatterplot of COVERAGE against MutantReads, with points coloured by QUAL.
ggplot(x_filtered, aes(x = MutantReads, y = COVERAGE, colour = QUAL)) +
  geom_point()

The low-quality calls have low coverage and a small number of mutant reads.

Chr 1 line plot

# Line plot of COVERAGE along chromosome 1.
# The CHR filter is required: without it positions from every chromosome are
# drawn on one POS axis and joined into a single, meaningless line.
# NOTE(review): assumes chromosome 1 is encoded as "1" in CHR (CHR is a
# character column, and "X" is used for chromosome X) -- confirm.
# Rows whose dbSNP value is "." are dropped (presumably variants without a
# dbSNP identifier).
child %>%
  filter(CHR == "1", dbSNP != ".") %>%
  ggplot(aes(x = POS, y = COVERAGE)) +
  geom_line(colour = "grey", size = 1)

Remove any variants with a coverage > 200 and plot the line graph again.

# Line plot of COVERAGE along chromosome 1, excluding rows with COVERAGE
# above 200.
# The CHR filter is required: without it positions from every chromosome are
# drawn on one POS axis and joined into a single, meaningless line.
# NOTE(review): assumes chromosome 1 is encoded as "1" in CHR (CHR is a
# character column, and "X" is used for chromosome X) -- confirm.
# Rows whose dbSNP value is "." are dropped (presumably variants without a
# dbSNP identifier).
child %>%
  filter(CHR == "1", dbSNP != ".", COVERAGE <= 200) %>%
  ggplot(aes(x = POS, y = COVERAGE)) +
  geom_line(colour = "grey", size = 1)