Skip to content
Snippets Groups Projects
Commit 8085fcda authored by Amine  GHOZLANE's avatar Amine GHOZLANE
Browse files

Main version is Florence's code

parent 15549fb5
Branches
Tags
No related merge requests found
#!/usr/bin/env Rscript
library(optparse)
library(tidyr)
parser <- OptionParser(formatter = TitledHelpFormatter)
parser <- add_option(parser, "--dbcan", help = "DBcan result", default=NULL)
parser <- add_option(parser, "--annotation", help = "Annotation", default=NULL)
parser <- add_option(parser, "--output", help = "Output file", default=NULL)
opts <- parse_args(parser)
if (is.null(opts$dbcan) || is.null(opts$annotation) || is.null(opts$output)) {
print_help(parser)
stop("All arguments are required. Please provide values for dbcan, annotation, and output.", call. = FALSE)
}
# Script functionality starts here (placeholder for your actual script logic)
# The following is a placeholder to show the working with the provided arguments
print(sprintf("DBcan file: %s", opts$dbcan))
print(sprintf("Annotation file: %s", opts$annotation))
print(sprintf("Output will be saved to: %s", opts$output))
dbcan_gut_raw = read.delim(opts$dbcan, stringsAsFactors = FALSE)
# head(dbcan_gut_raw)
dbcan_gut_raw$hmmer_clean = gsub("\\([0-9-]+\\)", "", dbcan_gut_raw$HMMER)
dbcan_gut_raw$dbcan_clean = gsub("_e[0-9]+", "", dbcan_gut_raw$dbCAN_sub)
dbcan_gut_raw$annotation = sapply(strsplit(paste(dbcan_gut_raw$DIAMOND, dbcan_gut_raw$hmmer_clean, dbcan_gut_raw$dbcan_clean, sep = "+"), split = "+", fixed = T), function(x) {
# Remove "-" and 2.3.1 etc
gh_vec = grep("\\.|-", unique(x), invert = T, value = T)
# Remove 'PL' if already PL_3
gh_vec = gh_vec[sapply(paste0(gh_vec, "_"), function(i) length(grep(i, gh_vec)) == 0)]
paste(gh_vec, collapse = ",")
})
gene_id_name = read.delim(opts$annotation,
stringsAsFactors = FALSE, header = TRUE)
# colnames(gene_id_name) = c("gene_id", "gene_name", "gene_length")
# print(dbcan_gut_raw)
dbcan_gut_raw$gene_id = gene_id_name[match(dbcan_gut_raw$Gene.ID, gene_id_name$gene_name), "gene_id"]
# print(dbcan_gut_raw)
rm(gene_id_name)
# gc()
# head(dbcan_gut_raw$annotation)
dbcan_gut_formated = dbcan_gut_raw[, c("gene_id","Gene.ID","annotation")] %>% separate_rows(annotation, sep = ",")
names(dbcan_gut_formated)[names(dbcan_gut_formated) == "Gene.ID"] <- "gene_name"
write.table(dbcan_gut_formated, file = opts$output, row.names = FALSE, quote=FALSE, sep="\t")
\ No newline at end of file
...@@ -74,6 +74,10 @@ def get_gene_annotation(dbcan_file: Path): ...@@ -74,6 +74,10 @@ def get_gene_annotation(dbcan_file: Path):
annotation = [item.split("+") for item in line[2:5] if item != "-"] annotation = [item.split("+") for item in line[2:5] if item != "-"]
else: else:
annotation += [item.split("+").split("(")[0] for item in line[2:5] if item != "-"] annotation += [item.split("+").split("(")[0] for item in line[2:5] if item != "-"]
if len(gene) > 0:
annotation = list(chain(*annotation))
annotation = [item.split("(")[0] if "(" else item in item for item in annotation]
yield gene, list(set(annotation))
def main(): def main():
""" """
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment