# Information ---------------------------------------------------------------- # Introduction to cancer genomics course, UNIL, day4 exercises. # ############## # Note: # Similarly to day 3 exercises, most of the code is already given in this script # and includes some explanations about the functions we used and why / how. # Even if many lines of codes are already provided, try understanding what they # are doing and ask us if you need help! # There are also some lines of codes that are missing (written # with ### MISSING CODE ###) and that you need to complete. If you don't find # how to do it, try looking back at some code from day 3, which contained some # of these "missing lines", or if still no idea, ask us for help. ############## # Preparation ------------------------------------------------------------- ########################################################################### # Find the directory created for this exercise in RStudio's 'Files' tab/panel # and make it the 'Working Directory', for example using # 'Session' -> 'Working Directory' -> 'To Files Pane Location' # make sure that this directory contains the present script # ('code_intro_cancer_genomics_day4.R'). Make sure this directory also contains # a subdirectory "Data" where you have copied the two files containing the TCGA # data used today. # Importing the gene expression data ----------------------------------------- fileCounts <- "Data/unc.edu_SKCM_IlluminaHiSeq_RNASeqV2.geneExp.tsv" normCounts <- read.delim(fileCounts, as.is=T, check.names=F) # To have an idead of the content of normCounts, show just some of the rows # and some of the columns: normCounts[1:6,1:3] # The gene_id column gives two values: the gene symbol followed by '|' and the # entrez id. We want to keep only the symbol for (i in 1:nrow(normCounts)){ fullGene <- normCounts[i,"gene_id"] geneName <- strsplit(fullGene, split="|", fixed=T)[[1]][1] # We use "fixed=T" to tell that the symbol "|" needs to be really this symbol # otherwise R thinks it is some symbol that has a special meaning. normCounts[i,"gene_id"] <- geneName } # Another more advanced way to obtain these gene names would be with help of # gsub as written in the next line: # normCounts$gene_id <- gsub("\\|.*", "", normCounts$gene_id) # We show again the first rows and columns to see the difference due to the # change made in previous lines of code normCounts[1:6,1:3] # Get the gene names (was 1st column of imported data) and make the normCounts # is only a matrix with the counts (i.e. without this 1st gene names column) genes <- normCounts[,1] normCounts <- t(as.matrix(normCounts[,-1])) # (note we also transposed this matrix to have the rows corresponding to the # samples and the columns corresponding to genes). # Check the new format of normCounts normCounts[1:6,1:3] genes[1:3] ### MISSING CODE ### # Similarly than in day3 exercises, there are some duplicated gene names. # Remove these duplicated genes and then give column names to normCounts, # corresponding to these unique gene names. ### # Importing clinical data ------------------------------------------------- clinFile <- "Data/nationwidechildrens.org_SKCM_bio.patient.tsv" clinData <- read.delim(clinFile, as.is=T) # The data from clinData doesn't have exactly the same samples and in the same # order so we'll need first to match the columns from normCounts with the # sample ids from the clinData. samples <- substr(rownames(normCounts), start=1, stop=12) # This function "substr" will return the first 12 characters from each column # name of normCounts. We need to do this because the sample ids from the clinical # data correspond only to these first characters. indMatch <- match(samples, clinData$sample) # Now that we have the matches, we can create the clinical.table with the same # rows and in the same order than in normCounts. clinical.table <- clinData[indMatch,] # And define the row names of this data.frame if we want to later get the data # from some specific sample names. rownames(clinical.table) <- rownames(normCounts) # Output the first rows and columns of this clinical.table clinical.table[1:5, 1:3] # (we can see that the values in the column "sample" correspond well to the # first characters of the rownames) # Exercise 1 ------------------------------------------------------------- ########################################################################### # Making a plot showing the correlation between two genes ----------------- x_gene <- "CD3E" y_gene <- "LCK" plot(normCounts[,x_gene], normCounts[,y_gene], xlab=x_gene, ylab=y_gene) # The function cor.test computes the correlation between two vectors. Here we # compute the spearman correlation but it can also be used for pearson # correlation. correl <- cor.test(normCounts[,x_gene], normCounts[,y_gene], method="spearman") # correl is a variable with multiple elements. For here, we can access the: # - correlation R-value with: correl$estimate # - correlation p-value with: correl$p.value mtext(text=paste("Spearman correl:", format(correl$estimate, digits=2), ", p-val:", format(correl$p.value, digits=2)), side=3) # mtext is there to add some text on the figure. The "format" function writes # the numbers with better formatting. # Exercise 2 ------------------------------------------------------------- ########################################################################### # Make a histogram of some genes, and then a boxplot in function of the tumor # site of origin of the sample ### MISSING CODE ### # For one (or more) of your genes of interest, do a histogram of its expression # in all the samples (hint: check back yesterday's exercise if you don't # remember how to do it). ### # We then obtain the tumor site of each sample samples_site <- clinical.table[,"submitted_tumor_site"] # The column "submitted_tumor_site" indicates if the tumor is coming from a # primary tumor or else # Check which are the different tumor sites of samples that are present in the data unique(samples_site) ### MISSING CODE ### # Do a boxplot of your gene of interst, grouping the samples by the "samples_site" variable ### # Keeping only samples from specific tumor sites ------------------- # We only want to keep the groups "Primary Tumor" and "Regional Lymph Node", so # we remove the other samples from the data (another way would be to set NA # values in the samples_site-variable for the samples we don't want to include so # that these samples would not be included when doing the boxplots below): # Before modifying the variables clinical.table and normCounts to remove some # samples, we make a copy of them in case we need back all the data. clinical.table.original <- clinical.table normCounts.original <- normCounts samples_toKeep <- (samples_site %in% c("Primary Tumor", "Regional Lymph Node")) samples_site <- samples_site[samples_toKeep] clinical.table <- clinical.table[samples_toKeep,] ### MISSING CODE ### Modify normCounts in a similar way to keep only these samples normCounts <- ... ### # Exercise 3 ------------------------------------------------------------- ########################################################################### # Making plots of the expression of some genes comparing --------------- # samples from primary tumor and lymph node metastasis # First doing the figure for just one gene. y <- normCounts[,"CD3E"] boxplot(y ~ samples_site, notch=T, ylab="CD3E", main="CD3E") wilcox <- wilcox.test(y ~ samples_site)$p.value # We use a Wilcoxon test to check for the significance of the difference # between the two groups. wilcox.str <- paste("wilcox.test:", format(wilcox, digits=2)) mtext(wilcox.str, side=3) # Add the test result on the figure. # Same figure but using a log scale which looks better y <- normCounts[,"CD3E"] + 1e-2 # We need to add a small number for the logscale to avoid log(0) in the samples # where the given gene would not be expressed at all (which is the case for CD19 # for example in some samples.) boxplot(y ~ samples_site, notch=T, ylab="CD3E", main="CD3E", log="y") wilcox <- wilcox.test(y ~ samples_site)$p.value wilcox.str <- paste("wilcox.test:", format(wilcox, digits=2)) mtext(wilcox.str, side=3) # And now we want to make the same figures but for multiple genes one after the # other. We will save these results in a pdf file. genesInterest <- c("CD3E", "CD19", "LCK", "PTPRC") pdf("genes_boxplots.pdf") for (cGene in genesInterest){ ### MISSING CODE ### # Adapt above's code to do similar boxplot figure than above for the various # genes from genesInterest - (do the figures in log scale and don't forget # to show in the figure the name of the current gene) #### } dev.off() # Excercise 4 ------------------------------------------------------------- ########################################################################### # Using EPIC web-application to estimate the proportion of different cells # present in the tumor. We developed this method. It estimates various cells # proportions in a bulk gene expression sample. The web application is available # at: http://epic.gfellerlab.org # First, we need to export the matrix of the gene expression (need to have the # genes along the rows, therefore we take the transpose of normCounts below). # We consider here only the primary tumor samples: normCounts_primary <- normCounts[samples_site %in% "Primary Tumor",] write.table(t(normCounts_primary), file="TCGA_SKCM_primary_tumors.txt", quote=F, sep="\t") # Exercise 5 -------------------------------------------------------------- # Using EPIC R package to analyze the data and do similar figures than above ----- # We first need to install it (this is only needed once): install.packages("devtools") # This package is also needed for EPIC's installation devtools::install_github("GfellerLab/EPIC") # And load it in the current session of R: library(EPIC) cellProportions <- EPIC(bulk=t(normCounts))$cellFractions # Note that we used the default reference profile and parameters. # Then we used the "$cellFractions" as we are interested here by the proportions # of the various cell types. You can get more information about the options # available in EPIC by typing for example "?EPIC" # Checking how this variables look like: head(cellProportions) # Determine which cell types are estimated by EPIC (it depends on the used # reference profile): colnames(cellProportions) # The "otherCells" type corresponds to the other cell types for which no # reference gene expression profile is available (i.e. mostly the cancer cells) ### MISSING CODE ### # For each of these cell types, do a boxplot showing the difference of # infiltration between the primary tumor and regional lymph node samples. # Save these figures in a pdf file. # Hint: the variable cellProportions is a matrix, where each row corresponds to # a sample and each column is the proportion predicted for the cell types. # You will thus need to use a for loop that goes over the different columns of # this cellProportions and do a boxplot as above # based first on the 1st column of this cellProportions, then do a new boxplot # based on the 2nd column, etc. (you can use "colnames(cellProportions)" to get # the names of each column). Don't forget to indicate the name of each cell type # in the figure of the corresponding boxplot. #### # Excercise 6 ------------------------------------------------------------- ########################################################################### # Getting some gene signatures ------------------------------------ # We first need to install the package GSEABase. With this package, we can # easily download some gene signatures that can be found on: # http://software.broadinstitute.org/gsea/msigdb # The package can be installed by executing the new 2 lines of code: source("https://bioconductor.org/biocLite.R") biocLite("GSEABase") # And then, to load this library into the current session: library(GSEABase) ##### # Note: If there is an issue installing the package GSEABase, I've saved the # gene list obtained from it (the variable "geneSigList" from below) in some # RData file that you can load instead of running the next lines of code. To # load this file, run the next line (removing the comment sign "#"): # load("Data/geneSig.RData") #### # First load the gene signatures of interest with help of the function from # GSEABase package: geneSig <- getBroadSets( c("http://software.broadinstitute.org/gsea/msigdb/cards/REACTOME_IMMUNE_SYSTEM.xml", "http://software.broadinstitute.org/gsea/msigdb/cards/REACTOME_ADAPTIVE_IMMUNE_SYSTEM.xml") ) # Computing the enrichment scores based on ssGSEA methods ---------------- # To compute the single sample gene set enrichment analysis scores, we will use # here the package GSEABase. It can be installed by executing these 2 lines of # code: source("https://bioconductor.org/biocLite.R") biocLite("GSVA") # And then,to load this library into the current session: library(GSVA) # package to compute the ssGSEA scores ##### # Note: If there is an issue installing the package GSVA, I've saved the # result obtained from it (i.e. the score.ssGSEA from below) in some # RData file that you can load instead of running the gsva function from below. # To load this file, run the next line (removing the comment sign "#"): # load("Data/score.ssGSEA.RData") #### score.ssGSEA <- t(gsva(t(normCounts), geneSig, rnaseq=T, method="ssgsea", abs.ranking=F)) # This function computes the single sample gene set enrichment analysis score # based on the genes signatures given here in geneSig. # Checking how this variables look like: head(score.ssGSEA) ### MISSING CODE ### # Using these scores, make boxplots showing the differences between the score in # the primary tumor samples and regional lymph node ones (do this for the # "immune system" and "adaptive immune system" gene signatures from REACTOME). ####