Source List

把我自己写的一些常用的函数整理成一个文件,在分析前source这些函数即可。

###################################################
## Project: source list                          ##
## Script Purpose: collect the function I wrote  ##
## Date: 2020.09.19                              ##
## Author: Yiming Sun                            ##
###################################################
# Convert a loom file to a Seurat object (tSNE/UMAP embeddings are not carried over).
# loom_path: path to the .loom file on disk
# normalized: TRUE if the loom matrix stores Seurat log-normalized values; they are
#   converted back before CreateSeuratObject (see note below)
# CellId_var: variable in col_attrs holding the cell names
# GeneId_var: variable in row_attrs holding the gene names
# meta_var: variables from the loom per-cell attributes to keep as meta.data
# project_name, nfeatures, scale_factor: passed to the standard Seurat pipeline
# Returns a Seurat object that has been normalized, variable-feature selected,
# scaled and run through PCA.
MyloomToSeurat <- function(loom_path, normalized = TRUE, CellId_var = 'CellID',
                           GeneId_var = 'Gene', meta_var, project_name = 'loom',
                           nfeatures = 2000, scale_factor = 10000) {
  library(loomR)
  library(Seurat)
  lfile <- connect(filename = loom_path, mode = 'r+')
  # release the HDF5 handle even if anything below fails
  on.exit(lfile$close_all(), add = TRUE)
  # loomR exposes the matrix as cells x genes; Seurat wants genes x cells
  express_matrix <- lfile[['matrix']][, ]
  rownames(express_matrix) <- lfile[['col_attrs']][[CellId_var]][]
  colnames(express_matrix) <- lfile[['row_attrs']][[GeneId_var]][]
  express_matrix <- t(express_matrix)
  # MARGIN = 2 pulls the requested per-cell attributes as a data.frame
  meta.data <- lfile$get.attribute.df(2, meta_var)
  if (normalized) {
    # undo log1p(counts / libsize * scale_factor); without the library sizes
    # this only recovers values proportional to counts -- TODO confirm the
    # intended scale (original divides, not multiplies, by scale_factor)
    express_matrix <- expm1(express_matrix) / scale_factor
  }
  seu.obj <- CreateSeuratObject(counts = express_matrix, project = project_name,
                                meta.data = meta.data)
  seu.obj <- NormalizeData(seu.obj)
  seu.obj <- FindVariableFeatures(seu.obj, selection.method = 'vst',
                                  nfeatures = nfeatures)
  seu.obj <- ScaleData(seu.obj, features = rownames(seu.obj))
  seu.obj <- RunPCA(seu.obj)
  return(seu.obj)
}
# Build a pseudo-bulk expression matrix from a Seurat object.
# Note: values come from the @data slot (normalized expression) and are
# averaged per cell group. (Function name keeps its historical spelling.)
# seu.obj: Seurat object
# meta.var: column in meta.data used to group the cells
# assay: which assay to pull @data from, either 'RNA' or 'integrated'
# Returns a genes x groups matrix of per-group mean expression.
MyPseudoBluk <- function(seu.obj, meta.var, assay) {
  library(Seurat)
  if (!(assay %in% c('RNA', 'integrated'))) {
    # fail loudly instead of returning a status string the caller may miss
    stop('Assay not support!')
  }
  # the two original branches only differed in the assay name; unify them
  expr <- slot([email protected][[assay]], 'data')
  groups <- as.factor(as.character([email protected][, meta.var]))
  cell_group <- levels(groups)
  pseudo_bulk <- matrix(nrow = nrow(expr), ncol = length(cell_group),
                        dimnames = list(rownames(expr), cell_group))
  for (i in seq_along(cell_group)) {
    in_group <- groups == cell_group[i]
    # drop = FALSE keeps a one-cell group as a matrix so rowSums still works
    pseudo_bulk[, i] <- rowSums(expr[, in_group, drop = FALSE]) / sum(in_group)
  }
  return(pseudo_bulk)
}
# Integrate a list of Seurat objects with the SCT workflow.
# object.list: list of Seurat objects to integrate
# nfeature: number of integration features to select across the objects
# feature_list: genes to carry into the integrated expression matrix
# Returns the integrated Seurat object with PCA (50 PCs) computed on the
# 'integrated' assay.
My_SCT_Integrate <- function(object.list, nfeature = 3000, feature_list) {
  library(Seurat)
  for (i in seq_along(object.list)) {
    object.list[[i]] <- SCTransform(object.list[[i]], verbose = TRUE)
  }
  object.features <- SelectIntegrationFeatures(object.list = object.list,
                                               nfeatures = nfeature)
  object.list <- PrepSCTIntegration(object.list = object.list,
                                    anchor.features = object.features,
                                    verbose = TRUE)
  object.anchors <- FindIntegrationAnchors(object.list = object.list,
                                           normalization.method = "SCT",
                                           anchor.features = object.features,
                                           verbose = TRUE)
  object.integrated <- IntegrateData(anchorset = object.anchors,
                                     normalization.method = "SCT",
                                     verbose = TRUE,
                                     features.to.integrate = feature_list)
  DefaultAssay(object.integrated) <- 'integrated'
  object.integrated <- RunPCA(object.integrated, npcs = 50, verbose = TRUE)
  return(object.integrated)
}
# Filter and clean a DAVID functional-annotation CSV export.
# file: path to the CSV file returned by DAVID
# term: category to keep, e.g. 'GOTERM_BP' (GO biological process) or 'KEGG'
# Returns the matching rows with the GO/KEGG id prefix stripped from Term,
# ordered by increasing FDR.
GO_filter <- function(file, term = 'GOTERM_BP') {
  gene_cluster <- read.csv(file = file, header = TRUE)
  gene_cluster <- gene_cluster[grep(pattern = term, gene_cluster$Category), ]
  if (grepl(pattern = 'GO', term)) {
    # '[^~]*' (not greedy '.*') so a '~' inside the term name is preserved
    gene_cluster$Term <- sub(pattern = '^GO[^~]*~', '', gene_cluster$Term)
  }
  if (grepl(pattern = 'KEGG', term)) {
    # strip the 'hsaNNNNN:'/'mmuNNNNN:' pathway id without eating later colons
    gene_cluster$Term <- sub(pattern = '^hsa[^:]*:', '', gene_cluster$Term)
    gene_cluster$Term <- sub(pattern = '^mmu[^:]*:', '', gene_cluster$Term)
  }
  gene_cluster <- gene_cluster[order(gene_cluster$FDR, decreasing = FALSE), ]
  return(gene_cluster)
}
# Score a set of lambda values for liger's iNMF by the alignment metric.
# object: liger object
# lambda: numeric vector of lambda values to test
# k: number of factors for the NMF
# knn_k, thresh, max.iters, k2, ref_dataset, resolution: passed through to
#   optimizeALS() / quantileAlignSNF()
# parl_num: number of worker processes to use
# library_path: .libPaths() entries the workers need to find liger
# Returns list(res, p): a data.frame of lambda vs alignment and a ggplot of it.
MySuggestLambda <- function(object, lambda, k, knn_k = 20, thresh = 1e-04,
                            max.iters = 100, k2 = 500, ref_dataset = NULL,
                            resolution = 1, parl_num,
                            library_path = c('/data/User/sunym/rstudio/R_library/liger/',
                                             '/data/User/sunym/rstudio/R_library/Rlib/')) {
  library(liger)
  library(parallel)
  library(ggplot2)  # ggplot()/geom_point() below were used without loading it
  res <- matrix(ncol = 2, nrow = length(lambda))
  colnames(res) <- c('lambda', 'alignment')
  res[, 'lambda'] <- lambda
  cl <- makeCluster(parl_num)
  # guarantee worker shutdown even if a worker errors out
  on.exit(stopCluster(cl), add = TRUE)
  # 'library_path' must be exported too: the clusterEvalQ() call below
  # evaluates .libPaths(library_path) on the workers
  clusterExport(cl, c('object', 'k', 'knn_k', 'thresh', 'max.iters', 'k2',
                      'ref_dataset', 'resolution', 'library_path'),
                envir = environment())
  clusterEvalQ(cl, .libPaths(library_path))
  clusterEvalQ(cl, library(liger))
  alignment_score <- parLapply(cl = cl, lambda, function(x) {
    object <- optimizeALS(object, k = k, lambda = x, thresh = thresh,
                          max.iters = max.iters)
    object <- quantileAlignSNF(object, knn_k = knn_k, k2 = k2,
                               ref_dataset = ref_dataset,
                               resolution = resolution)
    return(calcAlignment(object))
  })
  # unlist() instead of %>% so the function does not depend on magrittr
  res[, 'alignment'] <- unlist(alignment_score)
  res <- as.data.frame(res)
  res[, 1] <- as.numeric(as.character(res[, 1]))
  res[, 2] <- as.numeric(as.character(res[, 2]))
  p <- ggplot(res, aes(x = lambda, y = alignment)) + geom_point() + geom_line()
  return(list(res, p))
}
# Grid-search (lambda, k) for liger iNMF; saves one UMAP pdf per combination.
# object: liger object
# lambda, k: vectors of values; every (lambda, k) pair is tested
# parl_num: number of worker processes to use
# library_path: .libPaths() entries the workers need to find liger
# pic_path: directory where the per-combination pdf files are written
# remaining arguments are passed through to quantile_norm() /
#   louvainCluster() / runUMAP()
# Returns the string 'done'; the real output is the pdf files on disk.
MySuggestLiger <- function(object, lambda, k, parl_num, library_path, pic_path,
                           knn_k = 20, quantiles = 50, min_cells = 20,
                           do.center = FALSE, max_sample = 1000,
                           refine.knn = TRUE, eps = 0.9, resolution = 0.4,
                           min_dist = 0.1) {
  library(parallel)  # makeCluster()/parLapply() were used without loading it
  # build the full (lambda, k) grid without growing vectors in nested loops;
  # same pair order as the original: k cycles fastest within each lambda
  test_lambda <- rep(lambda, each = length(k))
  test_k <- rep(k, times = length(lambda))
  cl <- makeCluster(parl_num)
  # guarantee worker shutdown even if a worker errors out
  on.exit(stopCluster(cl), add = TRUE)
  clusterExport(cl, c('library_path', 'pic_path', 'test_lambda', 'test_k',
                      'object', 'knn_k', 'quantiles', 'min_cells', 'do.center',
                      'max_sample', 'refine.knn', 'eps', 'resolution',
                      'min_dist'), envir = environment())
  clusterEvalQ(cl, .libPaths(library_path))
  clusterEvalQ(cl, library(liger))
  parLapply(cl = cl, seq_along(test_lambda), function(x) {
    lambda <- test_lambda[x]
    k <- test_k[x]
    object <- optimizeALS(object, lambda = lambda, k = k)
    # quantile normalization and joint clustering
    object <- quantile_norm(object, knn_k = knn_k, quantiles = quantiles,
                            min_cells = min_cells, do.center = do.center,
                            max_sample = max_sample, refine.knn = refine.knn,
                            eps = eps)
    object <- louvainCluster(object, resolution = resolution)
    object <- runUMAP(object, distance = 'cosine', min_dist = min_dist)
    all.plots <- plotByDatasetAndCluster(object,
                                         axis.labels = c('UMAP 1', 'UMAP 2'),
                                         return.plots = TRUE)
    # pdf name encodes the tested combination: <pic_path>/lambda_<l>_k_<k>.pdf
    char <- paste('lambda', as.character(lambda), 'k', as.character(k), sep = '_')
    char <- paste(pic_path, char, sep = '/')
    char <- paste(char, 'pdf', sep = '.')
    pdf(file = char, width = 12, height = 6)
    print(all.plots[[1]] + all.plots[[2]])
    dev.off()
    return(0)
  })
  return('done')
}
# Translate the rownames of an expression matrix to homologous gene ids,
# summing the rows that map to the same target gene.
# express_matrix: matrix (typically dgCMatrix) with source gene names or
#   ensembl ids as rownames
# anno: annotation table with 4 columns: source gene name, source ensembl id,
#   target gene name, target ensembl id; the target gene name is preferred and
#   the target ensembl id is used when the name is missing ('')
# Returns the matrix with converted, de-duplicated rownames.
My_Convert_Homology_Gene_ID <- function(express_matrix, anno) {
  anno <- as.data.frame(anno)
  # sort by target name so rows with an empty name ('') sort first and
  # consistently trigger the ensembl-id fallback below
  anno <- anno[order(anno[, 3], decreasing = FALSE), ]
  gene <- rownames(express_matrix)
  # keep only genes present in either the name or the ensembl-id column
  gene <- intersect(gene, c(as.character(anno[, 1]), as.character(anno[, 2])))
  express_matrix <- express_matrix[gene, , drop = FALSE]
  # map every source gene to its target id (preallocated via vapply,
  # replacing the original append()-in-a-loop growth)
  gene_converted <- vapply(gene, function(g) {
    hit <- anno[anno[, 1] == g | anno[, 2] == g, ][1, ]
    if (hit[[3]] == '') {
      as.character(hit[[4]])
    } else {
      as.character(hit[[3]])
    }
  }, character(1), USE.NAMES = FALSE)
  anno <- data.frame(gene = gene, gene_converted = gene_converted)
  rownames(anno) <- anno$gene
  # split targets into those hit by exactly one source row and the rest
  gene_duplicated <- unique(gene_converted[duplicated(gene_converted)])
  gene_unique <- unique(gene_converted[!(gene_converted %in% gene_duplicated)])
  # uniquely-mapped rows are simply renamed; drop = FALSE keeps a single
  # selected row from collapsing to a plain vector
  express_unique <- express_matrix[anno[anno$gene_converted %in% gene_unique, ]$gene, , drop = FALSE]
  rownames(express_unique) <- as.character(anno[rownames(express_unique), ]$gene_converted)
  # rows sharing a target id are collapsed by summing their expression
  for (i in gene_duplicated) {
    temp_gene <- as.character(anno[anno$gene_converted == i, ]$gene)
    temp_express <- Matrix::colSums(express_matrix[temp_gene, , drop = FALSE])
    temp_express <- t(as.matrix(temp_express))
    rownames(temp_express) <- i
    express_unique <- rbind(express_unique, temp_express)
  }
  return(express_unique)
}

持续更新。

点赞

发表评论