diff --git a/bin/dbcantoflorence.r b/bin/dbcantoflorence.r new file mode 100644 index 0000000000000000000000000000000000000000..c785940e2c35546f775ec1d3d4614fafea1bdd7c --- /dev/null +++ b/bin/dbcantoflorence.r @@ -0,0 +1,44 @@ +#!/usr/bin/env Rscript +library(optparse) +library(tidyr) + +parser <- OptionParser(formatter = TitledHelpFormatter) +parser <- add_option(parser, "--dbcan", help = "DBcan result", default=NULL) +parser <- add_option(parser, "--annotation", help = "Annotation", default=NULL) +parser <- add_option(parser, "--output", help = "Output file", default=NULL) + +opts <- parse_args(parser) +if (is.null(opts$dbcan) || is.null(opts$annotation) || is.null(opts$output)) { + print_help(parser) + stop("All arguments are required. Please provide values for dbcan, annotation, and output.", call. = FALSE) +} + +# Script functionality starts here (placeholder for your actual script logic) +# The following is a placeholder to show the working with the provided arguments +print(sprintf("DBcan file: %s", opts$dbcan)) +print(sprintf("Annotation file: %s", opts$annotation)) +print(sprintf("Output will be saved to: %s", opts$output)) + +dbcan_gut_raw = read.delim(opts$dbcan, stringsAsFactors = FALSE) +# head(dbcan_gut_raw) +dbcan_gut_raw$hmmer_clean = gsub("\\([0-9-]+\\)", "", dbcan_gut_raw$HMMER) +dbcan_gut_raw$dbcan_clean = gsub("_e[0-9]+", "", dbcan_gut_raw$dbCAN_sub) +dbcan_gut_raw$annotation = sapply(strsplit(paste(dbcan_gut_raw$DIAMOND, dbcan_gut_raw$hmmer_clean, dbcan_gut_raw$dbcan_clean, sep = "+"), split = "+", fixed = T), function(x) { +# Remove "-" and 2.3.1 etc + gh_vec = grep("\\.|-", unique(x), invert = T, value = T) +# Remove 'PL' if already PL_3 + gh_vec = gh_vec[sapply(paste0(gh_vec, "_"), function(i) length(grep(i, gh_vec)) == 0)] + paste(gh_vec, collapse = ",") +}) +gene_id_name = read.delim(opts$annotation, + stringsAsFactors = FALSE, header = TRUE) +# colnames(gene_id_name) = c("gene_id", "gene_name", "gene_length") +# print(dbcan_gut_raw) +dbcan_gut_raw$gene_id = gene_id_name[match(dbcan_gut_raw$Gene.ID, gene_id_name$gene_name), "gene_id"] +# print(dbcan_gut_raw) +rm(gene_id_name) +# gc() +# head(dbcan_gut_raw$annotation) +dbcan_gut_formated = dbcan_gut_raw[, c("gene_id","Gene.ID","annotation")] %>% separate_rows(annotation, sep = ",") +names(dbcan_gut_formated)[names(dbcan_gut_formated) == "Gene.ID"] <- "gene_name" +write.table(dbcan_gut_formated, file = opts$output, row.names = FALSE, quote=FALSE, sep="\t") \ No newline at end of file diff --git a/bin/extract_dbcan.py b/bin/extract_dbcan.py index a82622ef81dfb795e81d9f3b196476e97a24f388..0fae37045f644405f16a1ef13ef42b3948c375fe 100644 --- a/bin/extract_dbcan.py +++ b/bin/extract_dbcan.py @@ -74,6 +74,10 @@ def get_gene_annotation(dbcan_file: Path): annotation = [item.split("+") for item in line[2:5] if item != "-"] else: annotation += [item.split("+").split("(")[0] for item in line[2:5] if item != "-"] + if len(gene) > 0: + annotation = list(chain(*annotation)) + annotation = [item.split("(")[0] if "(" else item in item for item in annotation] + yield gene, list(set(annotation)) def main(): """