eqtlcandida
eqtlcandida
Genome Wide eQTL analysis
1 post
Don't wanna be here? Send us removal request.
eqtlcandida · 6 years ago
Text
eQTL analysis for Candida Genome (R code)
gt = read.table("Desktop/Abhilash/study/Post-Doc/Lab_notebook/eQTL/Input_3/Differential_regulation_files/eQTL_input/MATRIX_eQTL/whole_genome/input/snps_Allchr.txt", sep="\t",                header=TRUE, row.names = 1) gt <- as.data.frame(as.matrix(t(gt)))
head(gt) dim(gt)
# Calculate MAF and draw MAF histogram maf <- colMeans(gt)/2 maf maf <- pmin(maf, 1-maf) maf
sum(maf > 0.5)
truehist(maf, main = "Histogram of MAF values.", col = "steelblue") lines(density(maf), lty = 2, col = "darkorange", lwd = 3)
write.csv(maf, "Desktop/Abhilash/study/Post-Doc/Lab_notebook/eQTL/Input_3/Differential_regulation_files/eQTL_input/MATRIX_eQTL/whole_genome/input/snps_allchr_maf0.05.csv")
snp_values = read.table("Desktop/Abhilash/study/Post-Doc/Lab_notebook/eQTL/Input_3/Differential_regulation_files/eQTL_input/MATRIX_eQTL/whole_genome/input/snps_Allchr_maf0.05.txt", sep="\t",                        header=TRUE, row.names = 1)
#snp_values <- as.data.frame(as.matrix(t(snp_values)))
#Obtain the Covarties by PCA of the Genotypes
pca_snps = prcomp(t(snp_values), center=TRUE, scale = F) plot(pca_snps$x[,c(1,2)],pch=16)
head(pca_snps$x[,c(1,2)])
plot(pca_snps) fviz_eig(pca_snps, addlabels = TRUE)
pca_snps$sdev summary(pca_snps)
write.csv(pca_snps$x[,1:10],"Desktop/Abhilash/study/Post-Doc/Lab_notebook/eQTL/Input_3/Differential_regulation_files/eQTL_input/MATRIX_eQTL/input/pca_snps.csv")
# eQTL mapping, cis/trans, no pcs suppressMessages(library(MatrixEQTL))
SNP_file_name <-  "Desktop/Abhilash/study/Post-Doc/Lab_notebook/eQTL/Input_3/Differential_regulation_files/eQTL_input/MATRIX_eQTL/whole_genome/input/snps_Allchr_maf0.05.txt";
snps_location_file_name <- "Desktop/Abhilash/study/Post-Doc/Lab_notebook/eQTL/Input_3/Differential_regulation_files/eQTL_input/MATRIX_eQTL/whole_genome/input/snploc_AllChr_maf0.05.txt"; expression_file_name <- "Desktop/Abhilash/study/Post-Doc/Lab_notebook/eQTL/Input_3/Differential_regulation_files/eQTL_input/MATRIX_eQTL/whole_genome/input/chrAll_FP_sum_GE.txt";
gene_location_file_name <- "Desktop/Abhilash/study/Post-Doc/Lab_notebook/eQTL/Input_3/Differential_regulation_files/eQTL_input/MATRIX_eQTL/whole_genome/input/chrAll_FP_sum_loc.txt";
covariates_file_name <- "Desktop/Abhilash/study/Post-Doc/Lab_notebook/eQTL/Input_3/Differential_regulation_files/eQTL_input/MATRIX_eQTL/whole_genome/input/pca_snps_10.txt";
cis_threshold <- 1e-5 trans_threshold <- 1e-5 cis_dist <- 2e4
# Output file name #output_file_name_cis = tempfile(); #output_file_name_tra = tempfile();
output_file_name_cis = "Desktop/Abhilash/study/Post-Doc/Lab_notebook/eQTL/Input_3/Differential_regulation_files/eQTL_input/MATRIX_eQTL/whole_genome/output_AllChr_cis_eqtls_PCA20kb.txt"; output_file_name_tra = "Desktop/Abhilash/study/Post-Doc/Lab_notebook/eQTL/Input_3/Differential_regulation_files/eQTL_input/MATRIX_eQTL/whole_genome/output_AllChr_trans_eqtls_PCA20kb.txt";
## Settings # Linear model to use, modelANOVA, modelLINEAR, or modelLINEAR_CROSS useModel = modelLINEAR; # modelANOVA, modelLINEAR, or modelLINEAR_CROSS
# Only associations significant at this level will be saved pvOutputThreshold_cis = cis_threshold; pvOutputThreshold_tra = trans_threshold;
# Set to character() for no covariates # covariates_file_name = character();
# Error covariance matrix # Set to numeric() for identity. errorCovariance = numeric();
# Distance for local gene-SNP pairs cisDist = cis_dist
## Load genotype data snps = SlicedData$new(); snps$fileDelimiter = "\t"; # the TAB character snps$fileOmitCharacters = "NA"; # denote missing values; snps$fileSkipRows = 1; # one row of column labels snps$fileSkipColumns = 1; # one column of row labels snps$fileSliceSize = 2000; # read file in slices of 2,000 rows snps$LoadFile(SNP_file_name);
## Load gene expression data gene = SlicedData$new(); gene$fileDelimiter = "\t"; # the TAB character gene$fileOmitCharacters = "NA"; # denote missing values; gene$fileSkipRows = 1; # one row of column labels gene$fileSkipColumns = 1; # one column of row labels gene$fileSliceSize = 2000; # read file in slices of 2,000 rows gene$LoadFile(expression_file_name);
## Load covariates cvrt = SlicedData$new(); cvrt$fileDelimiter = "\t"; # the TAB character cvrt$fileOmitCharacters = "NA"; # denote missing values; cvrt$fileSkipRows = 1; # one row of column labels cvrt$fileSkipColumns = 1; # one column of row labels if(length(covariates_file_name)>0) {  cvrt$LoadFile(covariates_file_name); }
## Run the analysis snpspos = read.table(snps_location_file_name, header = TRUE, stringsAsFactors = FALSE); genepos = read.table(gene_location_file_name, header = TRUE, stringsAsFactors = FALSE);
me = Matrix_eQTL_main(  snps = snps,  gene = gene,  cvrt = cvrt,  output_file_name = output_file_name_tra,  pvOutputThreshold = pvOutputThreshold_tra,  useModel = useModel,  errorCovariance = errorCovariance,  verbose = FALSE,  output_file_name.cis = output_file_name_cis,  pvOutputThreshold.cis = pvOutputThreshold_cis,  snpspos = snpspos,  genepos = genepos,  cisDist = cisDist,  pvalue.hist = TRUE,  min.pv.by.genesnp = FALSE,  noFDRsaveMemory = FALSE);
me_qq = Matrix_eQTL_main(  snps = snps,  gene = gene,  cvrt = cvrt,  output_file_name = output_file_name_tra,  pvOutputThreshold = pvOutputThreshold_tra,  useModel = useModel,  errorCovariance = errorCovariance,  verbose = FALSE,  output_file_name.cis = output_file_name_cis,  pvOutputThreshold.cis = pvOutputThreshold_cis,  snpspos = snpspos,  genepos = genepos,  cisDist = cisDist,  pvalue.hist = "qqplot",  min.pv.by.genesnp = FALSE,  noFDRsaveMemory = FALSE)
unlink(output_file_name_tra); unlink(output_file_name_cis);
cat('Analysis done in: ', me$time.in.sec, ' seconds', '\n');
head(me$cis$eqtls) head(me$trans$eqtls)
## Make the histogram of local and distant p-values plot(me)
## Make the qq-plot of local and distant p-values plot(me_qq)  
library(dplyr) snp_values = read.table(SNP_file_name, row.names=1, header=TRUE) snp_values = data.frame(snps = rownames(snp_values), snp_values, stringsAsFactors = FALSE) snp_values=snp_values[,-c(1)]
# For this we also need df with expression data gene_values = read.table(expression_file_name, row.names=1, header=TRUE) gene_values = data.frame(gene = rownames(gene_values), gene_values, stringsAsFactors = FALSE) gene_values=gene_values[,-c(1)] View(gene_values)
gene_values_transpose <- as.data.frame(as.matrix(t(gene_values))) snp_values_transpose <- as.data.frame(as.matrix(t(snp_values)))
cis_eqtl_res = me$cis$eqtls cis_eqtl_res = cis_eqtl_res[cis_eqtl_res$FDR < 0.05,] top_eqtls = cis_eqtl_res[order(cis_eqtl_res$pvalue),] top_eqtls = top_eqtls[!duplicated(top_eqtls$gene),] mafs = apply(as.matrix(snp_values[-1]),1,mean)/2 mafs = pmin(mafs, 1 - mafs) mafs = data.frame(snps=names(mafs), maf = mafs) top_eqtls = merge(top_eqtls, mafs, by="snps") top_eqtls = top_eqtls[order(top_eqtls$FDR),] head(top_eqtls) write.table(top_eqtls, "Desktop/Abhilash/study/Post-Doc/Lab_notebook/eQTL/Input_3/Differential_regulation_files/eQTL_input/MATRIX_eQTL/whole_genome/output/top_cis_eQTL_FP_20kb.txt", sep = "\t")
# Get gene name of gene with lowest association p-value gene_id = top_eqtls$gene[1] gene_id = as.character(gene_id) gene_id # Get corresponding SNP snp_id = top_eqtls[top_eqtls$gene == gene_id,"snps"][1] snp_id = as.character(snp_id) snp_id data = data.frame(t(snp_values[snp_id,]), t(gene_values[gene_id,]))
# # Get reference and alternative allele of the SNP # ref_alt = unlist(snpPos[snpPos$snpid== snp_id, c("ref", "alt")]) # # Prepare the genotype labels # gt_states= c(paste(ref_alt[1], ref_alt[1], sep="/"), paste(ref_alt[1], #                                                            ref_alt[2], sep="/"), paste(ref_alt[2], ref_alt[2], sep="/")) # gt_states = factor(gt_states, levels=gt_states) # # Assign the labels # data$gt = gt_states[data[,snp_id]+1] # # Subset to only genotype labels and expression # data = data[,c("gt", gene_id)] # colnames(data) = c("genotype", "expression") # # Plot # p = ggplot(data, aes(genotype, expression)) + #   ggtitle(paste("eQTL of gene",gene_id, "with",snp_id))+ #   geom_jitter(colour="darkgrey", position=position_jitter(width=0.25)) + #   geom_boxplot(outlier.size=0, alpha=0.6, fill="grey") + theme_bw() # print(p) # # # # # # snpPos = data.frame(snps = rownames(snpspos), snpspos, stringsAsFactors = FALSE) # snpPos = snpPos[,-c(1)] # top_snp = top_eqtls$snps[1] # top_gene = as.character(top_eqtls$gene[1]) # # top_snp_data = filter(snp_values, snp_values$snps == top_snp) # top_gene_data = filter(gene_values, gene == top_gene)
plot_data = as.data.frame(data) colnames(plot_data) = c("snp", "gene_expr")
plot_data$snp = as.factor(plot_data$snp) head(plot_data)
lm_top = lm(plot_data[,"gene_expr"] ~ as.numeric(plot_data[,"snp"])) summary(lm_top)
plot(plot_data, col="steelblue",     main = paste0(gene_id, "vs", snp_id)) abline(lm_top, col="darkorange", lwd = 2, lty = 2) y_range = range(plot_data[,"gene_expr"]) text(x=2, y=y_range[1] + 0.5*diff(y_range), paste0("p=",                                                   format(summary(lm_top)$coefficients[2,4],                                                          scentific=TRUE, digits=2)), col = "black") gene_id = "MDR1"
gene_values = read.table(expression_file_name, row.names=1, header=TRUE) single_gene_exp = SlicedData$new() single_gene_exp$CreateFromMatrix(as.matrix(gene_values[gene_id, , drop=FALSE])) single_gene_exp
snpspos = read.table(snps_location_file_name,                     header = TRUE,                     stringsAsFactors = FALSE)
genepos = read.table(gene_location_file_name,                     header = TRUE,                     stringsAsFactors = FALSE)
single_cis_eqtl_res = Matrix_eQTL_main(snps,                                       single_gene_exp,                                       verbose = FALSE,                                       output_file_name.cis = NULL,                                       output_file_name = NULL,                                       pvOutputThreshold.cis=1,                                       snpspos = snpspos,                                       genepos = genepos)
manh_data = merge(single_cis_eqtl_res$cis$eqtls, snpspos, by.x =c("snps") , by.y = c("snpid")) manh_data = manh_data [,c("pos", "chr", "pvalue", "snps")] head(manh_data)
# Plot the Manhattanplot with(manh_data ,plot(pos, -log10(pvalue), xlab = "genomic position (bp)",                     main=paste(gene_id, "associated SNPs"))) # Highlight the lead SNP with(manh_data[which.min(manh_data$pvalue),,drop=FALSE] ,     points(pos, -log10(pvalue), pch=20, col="red")) # Add a label to the lead SNP with(manh_data[which.min(manh_data$pvalue),,drop=FALSE],     text(pos + diff(range(manh_data$pos))*0.2, -log10(pvalue), labels = snps))
for (gene_id in top_eqtls$gene[c(1:50,(nrow(top_eqtls)-3):nrow(top_eqtls))]){  print(gene_id)  single_gene_exp = SlicedData$new()  single_gene_exp$CreateFromMatrix(as.matrix(gene_values[gene_id,, drop=FALSE]))
 single_cis_eqtl_res = Matrix_eQTL_main(snps, single_gene_exp,                                         output_file_name.cis = NULL,                                         output_file_name = NULL,                                         pvOutputThreshold.cis=1,                                         verbose = FALSE,                                         snpspos=snpspos,                                         genepos=genepos)
 manh_data = merge(single_cis_eqtl_res$cis$eqtl,                    snpspos,                    by.x =c("snps") , by.y = c("snpid"))  manh_data =manh_data[,c("pos", "chr", "pvalue", "snps")]  par(mfrow=c(1,1))  # Plot the Manhattanplot  with(manh_data ,plot(pos, -log10(pvalue), xlab = "genomic position (bp)",                       main=paste(gene_id, "associated SNPs")))  # Highlight the lead SNP  with(manh_data[which.min(manh_data$pvalue),,drop=FALSE] ,       points(pos, -log10(pvalue), pch=20, col="red"))  # Add a label to the lead SNP  with(manh_data[which.min(manh_data$pvalue),,drop=FALSE],       text(pos + diff(range(manh_data$pos))*0.2, -log10(pvalue), labels = snps))  scan(stdin()) }
par(mfrow=c(3,3))
# Get gene name of gene with lowest association p-value gene_id = top_eqtls$gene[63] gene_id = as.character(gene_id) gene_id # Get corresponding SNP snp_id = top_eqtls[top_eqtls$gene == gene_id,"snps"][1] snp_id = as.character(snp_id) snp_id data = data.frame(t(snp_values[snp_id,]), t(gene_values[gene_id,]))
plot_data = as.data.frame(data) colnames(plot_data) = c("snp", "gene_expr")
plot_data$snp = as.factor(plot_data$snp) head(plot_data)
lm_top = lm(plot_data[,"gene_expr"] ~ as.numeric(plot_data[,"snp"])) summary(lm_top)
plot(plot_data, col="steelblue",     main = paste0(gene_id,"vs" , snp_id)) abline(lm_top, col="darkorange", lwd = 2, lty = 2) y_range = range(plot_data[,"gene_expr"]) text(x=2, y=y_range[1] + 0.5*diff(y_range), paste0("p=",                                                   format(summary(lm_top)$coefficients[2,4],                                                          scentific=TRUE, digits=2)), col = "black")
0 notes