library(tidyverse)
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
We have three data files, a set of methylation values, a set of expression values and some annotation.
read_tsv("methylation.txt") -> meth
## Parsed with column specification:
## cols(
## Probe = col_character(),
## Region = col_character(),
## Methylation = col_double()
## )
head(meth)
read_tsv("expression.txt") -> expression
## Parsed with column specification:
## cols(
## Gene = col_character(),
## Expression = col_double()
## )
head(expression)
For the annotation we have a problem reading the chromsomes because the text chromosomes come so late in the file. We’ll increase the guess_max
to fix this.
read_tsv("final_annotation.txt", guess_max=1000000) -> annot
## Parsed with column specification:
## cols(
## Probe = col_character(),
## Chromosome = col_character(),
## Start = col_double(),
## End = col_double(),
## Strand = col_character()
## )
head(annot)
We want to bring the data from the three different files together. To do this we’ll need to use join
functions to combine the tables. In the end we’re only going to be interested in genes which have data in all of the different files so we’ll be using inner_join
.
Before we start joining tables we need to sort out the methylation data. At the moment we have duplicated gene names because we have combined methylation values for two different regions, gene body and promoter. To get this to a structure of one line per gene we need to spread the Region and Methylation columns.
To make the join with the expression data easier we’re also going to change the name of the Gene column from Probe to Gene.
meth %>%
spread(key = Region, value = Methylation) %>%
rename(Gene=Probe)-> meth
head(meth)
We also need to change the column name on the annotation data so they all say “Gene”
annot %>%
rename(Gene = Probe) -> annot
head(annot)
Now we can combine this with the expression and annotation tibbles to get a combined dataset.
meth %>%
inner_join(expression) %>%
inner_join(annot) -> combined
## Joining, by = "Gene"
## Joining, by = "Gene"
head(combined)
We’re going to apply some filters to clean up the data. We’ll get rid of:
combined %>%
filter(abs(End - Start) < 10000) %>%
filter(Promoter != -1) %>%
filter(substr(Gene,1,2) != "Gm") -> combined
We want to calculate some summary values from the data. We want to know the average expression , promoter and gene body meth and number of genes per chromosome.
To do this we group_by
chromsome and then use summarise
to calculate the mean values. The number of genes can be obtained with n()
combined %>%
group_by(Chromosome) %>%
summarise(
number_of_genes=n(),
Expression = mean(Expression),
Promoter = mean(Promoter),
Gene_body = mean(Gene_body))
We’d also like to see a tibble with the count of genes on the + and - strands per chromosome and the difference between the two.
We can get the count of genes on each strand by grouping by strand and chromosome, using spread
to put the + and - counts into different columns and then using mutate
to get the difference.
combined %>%
group_by(Chromosome, Strand) %>%
count() %>%
spread(key = Strand, value=n) %>%
mutate(Diff=`-` - `+`)
We want to add a new column to the combined data to say if a gene is highly expressed (expression is > 0). We can do this with mutate
and if_else
combined %>%
mutate(high_expressed=if_else(Expression > 0,"Yes","No")) -> combined
head(combined)
We want to find the median methylation for gene bodies and promoters for genes which are and aren’t highly expressed. This will involve grouping by the new variable we created and then summarising.
combined %>%
group_by(high_expressed) %>%
summarise(Gene_body = median(Gene_body), Promoter = median(Promoter))
Highly expressed gene bodies show much greater methylation than ones which are lowly expressed.
We want to plot out the methylation and expression datasets. We can do this from the combined data we already have.
combined %>%
ggplot(aes(Gene_body)) +
geom_density(fill="yellow")+
xlab("Gene Body Methylation (%)")+
ggtitle("Gene Body Methylation Distribution")
combined %>%
ggplot(aes(Promoter)) +
geom_density(fill="yellow")+
xlab("Promoter Methylation (%)")+
ggtitle("Promoter Methylation Distribution")
combined %>%
ggplot(aes(Expression)) +
geom_density(fill="yellow")+
xlab("Expression (log2RPM)")+
ggtitle("Expression Distribution")
We want to plot the mean methylation per chromosome from the original methylation data.
meth %>%
inner_join(annot) %>%
group_by(Chromosome) %>%
summarise(meth=mean(Gene_body)) %>%
ggplot(aes(x=Chromosome,y=meth))+
geom_col()+
coord_flip()
## Joining, by = "Gene"
Plot a stripchart, split by chromosome of the promoter methylation for the 500 highest expressed genes.
combined %>%
top_n(n=500,wt=Expression) %>%
ggplot(aes(x=Chromosome, y=Promoter)) +
geom_jitter(height=0, width=0.1)
We can do the same for the lowest genes:
combined %>%
top_n(n=500,wt=-Expression) %>%
ggplot(aes(x=Chromosome, y=Promoter)) +
geom_jitter(height=0, width=0.1)
We can see that high promoter methylation is more prevalent in low expressed genes than high ones.
Plot the relationship between Promoter meth and Gene meth, coloured by Expression.
combined %>%
ggplot(aes(x=Promoter, y=Gene_body, colour=Expression))+
geom_point(size=1)+
scale_colour_gradientn(colours=c("blue2","white","red2"))
Plot out the relationship between expression and gene body methylation as a scatterplot.
combined %>%
ggplot(aes(x=Promoter, y=Expression))+
geom_point(size=1, colour="grey")+
geom_density2d(color="black",size=1)
Draw violin plots for the distributions of gene body methylation for high and low expressed genes.
combined %>%
ggplot(aes(x=high_expressed, y=Gene_body)) +
geom_violin(fill="yellow")
Finally, draw a barplot showing mean gene body methylation +/- stdev for high and low expressed genes.
combined %>%
group_by(high_expressed) %>%
summarise(meth=mean(Gene_body), stdev=sd(Gene_body)) %>%
ggplot(aes(x=high_expressed, y=meth, ymin=meth-stdev, ymax=meth+stdev)) +
geom_col(fill="yellow", colour="black")+
geom_errorbar(width=0.3)+
ylab("Methylation Level (%)")+
coord_cartesian(ylim=c(0,100))