diff --git a/.gitignore b/.gitignore
index 54c3d8b5d2d29be09e40448b836440b4761944a4..1549660a14245f977cd974d3e94e44af11a9db55 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,3 @@ Escheria*
 work
 .DS_Store
 report*
-bin
\ No newline at end of file
diff --git a/bin/dbcan2meteor.py b/bin/dbcan2meteor.py
new file mode 100644
--- /dev/null
+++ b/bin/dbcan2meteor.py
@@ -0,0 +1,63 @@
+"""Attach gene identifiers to dbCAN annotations.
+
+Inner-joins the dbCAN table (columns gene_name and annotation) with the
+lite annotation table (columns gene_id and gene_name) on gene_name, then
+writes a tab-separated gene_id / gene_name / annotation table that keeps
+one row per gene_id.
+"""
+import argparse
+import sys
+from pathlib import Path
+
+import pandas as pd
+
+__author__ = "Amine Ghozlane"
+__copyright__ = "Copyright 2015, Institut Pasteur"
+__credits__ = ["Amine Ghozlane"]
+__license__ = "GPL"
+__version__ = "1.0.0"
+__maintainer__ = "Amine Ghozlane"
+__email__ = "amine.ghozlane@pasteur.fr"
+__status__ = "Developpement"
+
+
+def get_arguments():
+    """Retrieve the command-line arguments of the program.
+
+    Returns: An object that contains the arguments
+    """
+    parser = argparse.ArgumentParser(
+        description=__doc__, usage=f"{sys.argv[0]} -h"
+    )
+    parser.add_argument('-i', dest='dbcan_file', type=Path, required=True,
+                        help='Path to the dbcan file.')
+    # No default here: a required option never falls back to one.
+    parser.add_argument('-a', dest='annotation_file', type=Path,
+                        required=True, help='Path to lite annotation file.')
+    parser.add_argument('-o', dest='output_file', type=Path, required=True,
+                        help='Output file.')
+    return parser.parse_args()
+
+
+def main():
+    """Merge both tables and write the first annotation of each gene_id."""
+    args = get_arguments()
+    dbcan_content = pd.read_csv(args.dbcan_file, sep="\t")
+    annotation_content = pd.read_csv(args.annotation_file, sep="\t")
+    merged = dbcan_content.merge(
+        annotation_content[["gene_id", "gene_name"]],
+        on="gene_name",
+        how="inner",
+    )
+    # A stable sort keeps the rows of one gene_id in merge order, so the
+    # annotation kept below is always the same one for a given input.
+    merged = merged[["gene_id", "gene_name", "annotation"]].sort_values(
+        by="gene_id", kind="mergesort"
+    )
+    # Keep a single row (the first one) per gene_id.
+    merged = merged.groupby("gene_id").head(1)
+    merged.to_csv(args.output_file, sep="\t", index=False)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/bin/extract_dbcan.py b/bin/extract_dbcan.py
new file mode 100644
--- /dev/null
+++ b/bin/extract_dbcan.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+# A copy of the GNU General Public License is available at
+# http://www.gnu.org/licenses/gpl-3.0.html
+
+"""Prepare the catalogues."""
+import argparse
+import sys
+from pathlib import Path
+import csv
+from itertools import chain
+
+
+def isfile(path: str) -> Path:  # pragma: no cover
+    """Check if path is an existing file.
+
+    :param path: (str) Path to the file
+
+    :raises ArgumentTypeError: If file does not exist
+
+    :return: (Path) Path object of the input file
+    """
+    myfile = Path(path)
+    if not myfile.is_file():
+        if myfile.is_dir():
+            msg = f"{myfile.name} is a directory."
+        else:
+            msg = f"{myfile.name} does not exist."
+        raise argparse.ArgumentTypeError(msg)
+    return myfile
+
+
+def get_arguments():
+    """Retrieve the command-line arguments of the program.
+
+    Returns: An object that contains the arguments
+    """
+    parser = argparse.ArgumentParser(
+        description=__doc__, usage=f"{sys.argv[0]} -h"
+    )
+    # isfile rejects a missing input while the arguments are being parsed.
+    parser.add_argument('-i', dest='dbcan_file', type=isfile, required=True,
+                        help='Path to the dbcan file.')
+    parser.add_argument('-o', dest='output_file', type=Path, required=True,
+                        help='Output file.')
+    return parser.parse_args()
+
+
+def _row_families(row):
+    """Return the annotation names held in columns 3 to 5 of one row.
+
+    Empty cells and cells equal to "-" are skipped, "+"-joined cells are
+    split and any "(...)" suffix is removed from each name.
+    """
+    cells = (cell.split("+") for cell in row[2:5] if cell not in ("", "-"))
+    return [name.split("(")[0] for name in chain.from_iterable(cells)]
+
+
+def get_gene_annotation(dbcan_file: Path):
+    """Yield each gene with its unique annotations.
+
+    The first line is treated as a header and the first column as the
+    gene name. Rows are grouped while the gene name stays the same, so
+    the rows of one gene are expected to be consecutive.
+
+    :param dbcan_file: (Path) Tab-separated dbcan file
+
+    :return: Generator of (gene, sorted list of unique annotations)
+    """
+    with dbcan_file.open("rt", newline="") as dbcan:
+        dbcan_reader = csv.reader(dbcan, delimiter="\t")
+        # Skip the header; the default keeps an empty file from raising.
+        next(dbcan_reader, None)
+        gene = ""
+        annotation = set()
+        for line in dbcan_reader:
+            if not line:
+                continue
+            if gene != line[0]:
+                if gene:
+                    yield gene, sorted(annotation)
+                gene = line[0]
+                annotation = set()
+            annotation.update(_row_families(line))
+        # The last gene has no following row to trigger its emission.
+        if gene:
+            yield gene, sorted(annotation)
+
+
+def main():
+    """Write a header, then one tab-separated gene/annotation line each."""
+    args = get_arguments()
+    with args.output_file.open("wt") as output:
+        print("gene_name\tannotation", file=output)
+        for gene, annotation in get_gene_annotation(args.dbcan_file):
+            for cazy in annotation:
+                print(f"{gene}\t{cazy}", file=output)
+
+
+if __name__ == '__main__':
+    main()