eqtlcandida - Tumblr blog

eQTL analysis for Candida Genome (R code)

gt = read.table("Desktop/Abhilash/study/Post-Doc/Lab_notebook/eQTL/Input_3/Differential_regulation_files/eQTL_input/MATRIX_eQTL/whole_genome/input/snps_Allchr.txt", sep="\t", header=TRUE, row.names = 1) gt <- as.data.frame(as.matrix(t(gt)))

head(gt) dim(gt)

# Calculate MAF and draw MAF histogram maf <- colMeans(gt)/2 maf maf <- pmin(maf, 1-maf) maf

sum(maf > 0.5)

truehist(maf, main = "Histogram of MAF values.", col = "steelblue") lines(density(maf), lty = 2, col = "darkorange", lwd = 3)

write.csv(maf, "Desktop/Abhilash/study/Post-Doc/Lab_notebook/eQTL/Input_3/Differential_regulation_files/eQTL_input/MATRIX_eQTL/whole_genome/input/snps_allchr_maf0.05.csv")

snp_values = read.table("Desktop/Abhilash/study/Post-Doc/Lab_notebook/eQTL/Input_3/Differential_regulation_files/eQTL_input/MATRIX_eQTL/whole_genome/input/snps_Allchr_maf0.05.txt", sep="\t", header=TRUE, row.names = 1)

#snp_values <- as.data.frame(as.matrix(t(snp_values)))

#Obtain the Covarties by PCA of the Genotypes

pca_snps = prcomp(t(snp_values), center=TRUE, scale = F) plot(pca_snps$x[,c(1,2)],pch=16)

head(pca_snps$x[,c(1,2)])

plot(pca_snps) fviz_eig(pca_snps, addlabels = TRUE)

pca_snps$sdev summary(pca_snps)

write.csv(pca_snps$x[,1:10],"Desktop/Abhilash/study/Post-Doc/Lab_notebook/eQTL/Input_3/Differential_regulation_files/eQTL_input/MATRIX_eQTL/input/pca_snps.csv")

# eQTL mapping, cis/trans, no pcs suppressMessages(library(MatrixEQTL))

SNP_file_name <- "Desktop/Abhilash/study/Post-Doc/Lab_notebook/eQTL/Input_3/Differential_regulation_files/eQTL_input/MATRIX_eQTL/whole_genome/input/snps_Allchr_maf0.05.txt";

snps_location_file_name <- "Desktop/Abhilash/study/Post-Doc/Lab_notebook/eQTL/Input_3/Differential_regulation_files/eQTL_input/MATRIX_eQTL/whole_genome/input/snploc_AllChr_maf0.05.txt"; expression_file_name <- "Desktop/Abhilash/study/Post-Doc/Lab_notebook/eQTL/Input_3/Differential_regulation_files/eQTL_input/MATRIX_eQTL/whole_genome/input/chrAll_FP_sum_GE.txt";

gene_location_file_name <- "Desktop/Abhilash/study/Post-Doc/Lab_notebook/eQTL/Input_3/Differential_regulation_files/eQTL_input/MATRIX_eQTL/whole_genome/input/chrAll_FP_sum_loc.txt";

covariates_file_name <- "Desktop/Abhilash/study/Post-Doc/Lab_notebook/eQTL/Input_3/Differential_regulation_files/eQTL_input/MATRIX_eQTL/whole_genome/input/pca_snps_10.txt";

cis_threshold <- 1e-5 trans_threshold <- 1e-5 cis_dist <- 2e4

# Output file name #output_file_name_cis = tempfile(); #output_file_name_tra = tempfile();

output_file_name_cis = "Desktop/Abhilash/study/Post-Doc/Lab_notebook/eQTL/Input_3/Differential_regulation_files/eQTL_input/MATRIX_eQTL/whole_genome/output_AllChr_cis_eqtls_PCA20kb.txt"; output_file_name_tra = "Desktop/Abhilash/study/Post-Doc/Lab_notebook/eQTL/Input_3/Differential_regulation_files/eQTL_input/MATRIX_eQTL/whole_genome/output_AllChr_trans_eqtls_PCA20kb.txt";

## Settings # Linear model to use, modelANOVA, modelLINEAR, or modelLINEAR_CROSS useModel = modelLINEAR; # modelANOVA, modelLINEAR, or modelLINEAR_CROSS

# Only associations significant at this level will be saved pvOutputThreshold_cis = cis_threshold; pvOutputThreshold_tra = trans_threshold;

# Set to character() for no covariates # covariates_file_name = character();

# Error covariance matrix # Set to numeric() for identity. errorCovariance = numeric();

# Distance for local gene-SNP pairs cisDist = cis_dist

## Load genotype data snps = SlicedData$new(); snps$fileDelimiter = "\t"; # the TAB character snps$fileOmitCharacters = "NA"; # denote missing values; snps$fileSkipRows = 1; # one row of column labels snps$fileSkipColumns = 1; # one column of row labels snps$fileSliceSize = 2000; # read file in slices of 2,000 rows snps$LoadFile(SNP_file_name);

## Load gene expression data gene = SlicedData$new(); gene$fileDelimiter = "\t"; # the TAB character gene$fileOmitCharacters = "NA"; # denote missing values; gene$fileSkipRows = 1; # one row of column labels gene$fileSkipColumns = 1; # one column of row labels gene$fileSliceSize = 2000; # read file in slices of 2,000 rows gene$LoadFile(expression_file_name);

## Load covariates cvrt = SlicedData$new(); cvrt$fileDelimiter = "\t"; # the TAB character cvrt$fileOmitCharacters = "NA"; # denote missing values; cvrt$fileSkipRows = 1; # one row of column labels cvrt$fileSkipColumns = 1; # one column of row labels if(length(covariates_file_name)>0) { cvrt$LoadFile(covariates_file_name); }

## Run the analysis snpspos = read.table(snps_location_file_name, header = TRUE, stringsAsFactors = FALSE); genepos = read.table(gene_location_file_name, header = TRUE, stringsAsFactors = FALSE);

me = Matrix_eQTL_main( snps = snps, gene = gene, cvrt = cvrt, output_file_name = output_file_name_tra, pvOutputThreshold = pvOutputThreshold_tra, useModel = useModel, errorCovariance = errorCovariance, verbose = FALSE, output_file_name.cis = output_file_name_cis, pvOutputThreshold.cis = pvOutputThreshold_cis, snpspos = snpspos, genepos = genepos, cisDist = cisDist, pvalue.hist = TRUE, min.pv.by.genesnp = FALSE, noFDRsaveMemory = FALSE);

me_qq = Matrix_eQTL_main( snps = snps, gene = gene, cvrt = cvrt, output_file_name = output_file_name_tra, pvOutputThreshold = pvOutputThreshold_tra, useModel = useModel, errorCovariance = errorCovariance, verbose = FALSE, output_file_name.cis = output_file_name_cis, pvOutputThreshold.cis = pvOutputThreshold_cis, snpspos = snpspos, genepos = genepos, cisDist = cisDist, pvalue.hist = "qqplot", min.pv.by.genesnp = FALSE, noFDRsaveMemory = FALSE)

unlink(output_file_name_tra); unlink(output_file_name_cis);

cat('Analysis done in: ', me$time.in.sec, ' seconds', '\n');

head(me$cis$eqtls) head(me$trans$eqtls)

## Make the histogram of local and distant p-values plot(me)

## Make the qq-plot of local and distant p-values plot(me_qq)

library(dplyr) snp_values = read.table(SNP_file_name, row.names=1, header=TRUE) snp_values = data.frame(snps = rownames(snp_values), snp_values, stringsAsFactors = FALSE) snp_values=snp_values[,-c(1)]

# For this we also need df with expression data gene_values = read.table(expression_file_name, row.names=1, header=TRUE) gene_values = data.frame(gene = rownames(gene_values), gene_values, stringsAsFactors = FALSE) gene_values=gene_values[,-c(1)] View(gene_values)

gene_values_transpose <- as.data.frame(as.matrix(t(gene_values))) snp_values_transpose <- as.data.frame(as.matrix(t(snp_values)))

cis_eqtl_res = me$cis$eqtls cis_eqtl_res = cis_eqtl_res[cis_eqtl_res$FDR < 0.05,] top_eqtls = cis_eqtl_res[order(cis_eqtl_res$pvalue),] top_eqtls = top_eqtls[!duplicated(top_eqtls$gene),] mafs = apply(as.matrix(snp_values[-1]),1,mean)/2 mafs = pmin(mafs, 1 - mafs) mafs = data.frame(snps=names(mafs), maf = mafs) top_eqtls = merge(top_eqtls, mafs, by="snps") top_eqtls = top_eqtls[order(top_eqtls$FDR),] head(top_eqtls) write.table(top_eqtls, "Desktop/Abhilash/study/Post-Doc/Lab_notebook/eQTL/Input_3/Differential_regulation_files/eQTL_input/MATRIX_eQTL/whole_genome/output/top_cis_eQTL_FP_20kb.txt", sep = "\t")

# Get gene name of gene with lowest association p-value gene_id = top_eqtls$gene[1] gene_id = as.character(gene_id) gene_id # Get corresponding SNP snp_id = top_eqtls[top_eqtls$gene == gene_id,"snps"][1] snp_id = as.character(snp_id) snp_id data = data.frame(t(snp_values[snp_id,]), t(gene_values[gene_id,]))

# # Get reference and alternative allele of the SNP # ref_alt = unlist(snpPos[snpPos$snpid== snp_id, c("ref", "alt")]) # # Prepare the genotype labels # gt_states= c(paste(ref_alt[1], ref_alt[1], sep="/"), paste(ref_alt[1], # ref_alt[2], sep="/"), paste(ref_alt[2], ref_alt[2], sep="/")) # gt_states = factor(gt_states, levels=gt_states) # # Assign the labels # data$gt = gt_states[data[,snp_id]+1] # # Subset to only genotype labels and expression # data = data[,c("gt", gene_id)] # colnames(data) = c("genotype", "expression") # # Plot # p = ggplot(data, aes(genotype, expression)) + # ggtitle(paste("eQTL of gene",gene_id, "with",snp_id))+ # geom_jitter(colour="darkgrey", position=position_jitter(width=0.25)) + # geom_boxplot(outlier.size=0, alpha=0.6, fill="grey") + theme_bw() # print(p) # # # # # # snpPos = data.frame(snps = rownames(snpspos), snpspos, stringsAsFactors = FALSE) # snpPos = snpPos[,-c(1)] # top_snp = top_eqtls$snps[1] # top_gene = as.character(top_eqtls$gene[1]) # # top_snp_data = filter(snp_values, snp_values$snps == top_snp) # top_gene_data = filter(gene_values, gene == top_gene)

plot_data = as.data.frame(data) colnames(plot_data) = c("snp", "gene_expr")

plot_data$snp = as.factor(plot_data$snp) head(plot_data)

lm_top = lm(plot_data[,"gene_expr"] ~ as.numeric(plot_data[,"snp"])) summary(lm_top)

plot(plot_data, col="steelblue", main = paste0(gene_id, "vs", snp_id)) abline(lm_top, col="darkorange", lwd = 2, lty = 2) y_range = range(plot_data[,"gene_expr"]) text(x=2, y=y_range[1] + 0.5*diff(y_range), paste0("p=", format(summary(lm_top)$coefficients[2,4], scentific=TRUE, digits=2)), col = "black") gene_id = "MDR1"

gene_values = read.table(expression_file_name, row.names=1, header=TRUE) single_gene_exp = SlicedData$new() single_gene_exp$CreateFromMatrix(as.matrix(gene_values[gene_id, , drop=FALSE])) single_gene_exp

snpspos = read.table(snps_location_file_name, header = TRUE, stringsAsFactors = FALSE)

genepos = read.table(gene_location_file_name, header = TRUE, stringsAsFactors = FALSE)

single_cis_eqtl_res = Matrix_eQTL_main(snps, single_gene_exp, verbose = FALSE, output_file_name.cis = NULL, output_file_name = NULL, pvOutputThreshold.cis=1, snpspos = snpspos, genepos = genepos)

manh_data = merge(single_cis_eqtl_res$cis$eqtls, snpspos, by.x =c("snps") , by.y = c("snpid")) manh_data = manh_data [,c("pos", "chr", "pvalue", "snps")] head(manh_data)

# Plot the Manhattanplot with(manh_data ,plot(pos, -log10(pvalue), xlab = "genomic position (bp)", main=paste(gene_id, "associated SNPs"))) # Highlight the lead SNP with(manh_data[which.min(manh_data$pvalue),,drop=FALSE] , points(pos, -log10(pvalue), pch=20, col="red")) # Add a label to the lead SNP with(manh_data[which.min(manh_data$pvalue),,drop=FALSE], text(pos + diff(range(manh_data$pos))*0.2, -log10(pvalue), labels = snps))

for (gene_id in top_eqtls$gene[c(1:50,(nrow(top_eqtls)-3):nrow(top_eqtls))]){ print(gene_id) single_gene_exp = SlicedData$new() single_gene_exp$CreateFromMatrix(as.matrix(gene_values[gene_id,, drop=FALSE]))

single_cis_eqtl_res = Matrix_eQTL_main(snps, single_gene_exp, output_file_name.cis = NULL, output_file_name = NULL, pvOutputThreshold.cis=1, verbose = FALSE, snpspos=snpspos, genepos=genepos)

manh_data = merge(single_cis_eqtl_res$cis$eqtl, snpspos, by.x =c("snps") , by.y = c("snpid")) manh_data =manh_data[,c("pos", "chr", "pvalue", "snps")] par(mfrow=c(1,1)) # Plot the Manhattanplot with(manh_data ,plot(pos, -log10(pvalue), xlab = "genomic position (bp)", main=paste(gene_id, "associated SNPs"))) # Highlight the lead SNP with(manh_data[which.min(manh_data$pvalue),,drop=FALSE] , points(pos, -log10(pvalue), pch=20, col="red")) # Add a label to the lead SNP with(manh_data[which.min(manh_data$pvalue),,drop=FALSE], text(pos + diff(range(manh_data$pos))*0.2, -log10(pvalue), labels = snps)) scan(stdin()) }

par(mfrow=c(3,3))

# Get gene name of gene with lowest association p-value gene_id = top_eqtls$gene[63] gene_id = as.character(gene_id) gene_id # Get corresponding SNP snp_id = top_eqtls[top_eqtls$gene == gene_id,"snps"][1] snp_id = as.character(snp_id) snp_id data = data.frame(t(snp_values[snp_id,]), t(gene_values[gene_id,]))

plot_data = as.data.frame(data) colnames(plot_data) = c("snp", "gene_expr")

plot_data$snp = as.factor(plot_data$snp) head(plot_data)

lm_top = lm(plot_data[,"gene_expr"] ~ as.numeric(plot_data[,"snp"])) summary(lm_top)

plot(plot_data, col="steelblue", main = paste0(gene_id,"vs" , snp_id)) abline(lm_top, col="darkorange", lwd = 2, lty = 2) y_range = range(plot_data[,"gene_expr"]) text(x=2, y=y_range[1] + 0.5*diff(y_range), paste0("p=", format(summary(lm_top)$coefficients[2,4], scentific=TRUE, digits=2)), col = "black")