Main version is Florence's code

8085fcda · Amine GHOZLANE · 15549fb5 · 8085fcda · 8085fcda
Commit 8085fcda authored 9 months ago by Amine GHOZLANE
--- a/bin/dbcantoflorence.r
+++ b/bin/dbcantoflorence.r
+#!/usr/bin/env Rscript
+library(optparse)
+library(tidyr)
+parser <- OptionParser(formatter = TitledHelpFormatter)
+parser <- add_option(parser, "--dbcan", help = "DBcan result", default=NULL)
+parser <- add_option(parser, "--annotation", help = "Annotation", default=NULL)
+parser <- add_option(parser, "--output", help = "Output file", default=NULL)
+opts <- parse_args(parser)
+if (is.null(opts$dbcan) || is.null(opts$annotation) || is.null(opts$output)) {
+    print_help(parser)
+    stop("All arguments are required. Please provide values for dbcan, annotation, and output.", call. = FALSE)
+}
+# Script functionality starts here (placeholder for your actual script logic)
+# The following is a placeholder to show the working with the provided arguments
+print(sprintf("DBcan file: %s", opts$dbcan))
+print(sprintf("Annotation file: %s", opts$annotation))
+print(sprintf("Output will be saved to: %s", opts$output))
+dbcan_gut_raw = read.delim(opts$dbcan, stringsAsFactors = FALSE)
+# head(dbcan_gut_raw)
+dbcan_gut_raw$hmmer_clean = gsub("\\([0-9-]+\\)", "", dbcan_gut_raw$HMMER)
+dbcan_gut_raw$dbcan_clean = gsub("_e[0-9]+", "", dbcan_gut_raw$dbCAN_sub)
+dbcan_gut_raw$annotation = sapply(strsplit(paste(dbcan_gut_raw$DIAMOND, dbcan_gut_raw$hmmer_clean, dbcan_gut_raw$dbcan_clean, sep = "+"), split = "+", fixed = T), function(x) {
+# Remove "-" and 2.3.1 etc
+  gh_vec = grep("\\.|-", unique(x), invert = T, value = T)
+# Remove 'PL' if already PL_3
+  gh_vec = gh_vec[sapply(paste0(gh_vec, "_"), function(i) length(grep(i, gh_vec)) == 0)]
+  paste(gh_vec, collapse = ",")
+})
+gene_id_name = read.delim(opts$annotation,
+                          stringsAsFactors = FALSE, header = TRUE)
+# colnames(gene_id_name) = c("gene_id", "gene_name", "gene_length")
+# print(dbcan_gut_raw)
+dbcan_gut_raw$gene_id = gene_id_name[match(dbcan_gut_raw$Gene.ID, gene_id_name$gene_name), "gene_id"]
+# print(dbcan_gut_raw)
+rm(gene_id_name)
+# gc()
+# head(dbcan_gut_raw$annotation)
+dbcan_gut_formated = dbcan_gut_raw[, c("gene_id","Gene.ID","annotation")] %>% separate_rows(annotation, sep = ",")
+names(dbcan_gut_formated)[names(dbcan_gut_formated) == "Gene.ID"] <- "gene_name"
+write.table(dbcan_gut_formated, file = opts$output, row.names = FALSE, quote=FALSE, sep="\t")
\ No newline at end of file
--- a/bin/extract_dbcan.py
+++ b/bin/extract_dbcan.py
@@ -74,6 +74,10 @@ def get_gene_annotation(dbcan_file: Path):
                annotation = [item.split("+") for item in line[2:5] if item != "-"]
            else:
                annotation += [item.split("+").split("(")[0] for item in line[2:5] if item != "-"]
+        if len(gene) > 0:
+            annotation = list(chain(*annotation))
+            annotation = [item.split("(")[0] if "(" else item in item for item in annotation]
+            yield gene, list(set(annotation))
 def main():
    """