Skip to content
Snippets Groups Projects
Commit 65490369 authored by Amine  GHOZLANE's avatar Amine GHOZLANE
Browse files

Initial commit

parents
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# A copy of the GNU General Public License is available at
# http://www.gnu.org/licenses/gpl-3.0.html
import argparse
import pandas as pd
from pathlib import Path
# Command-line interface: four positional arguments, in this order.
parser = argparse.ArgumentParser()
for positional, converter in (
    ("metaphlan_file", Path),
    ("metaphlan_len", Path),
    ("sample_name", str),
    ("output_file", Path),
):
    parser.add_argument(positional, type=converter)
args = parser.parse_args()

# Column names are supplied explicitly, so every line of each file is read
# as a data row.
marker_counts = pd.read_csv(
    args.metaphlan_file,
    delimiter="\t",
    names=["gene", "count"],
    low_memory=False,
)
marker_lengths = pd.read_csv(
    args.metaphlan_len,
    delimiter="\t",
    compression="gzip",
    names=["gene", "len"],
)

# Inner join: only markers present in both tables survive.
table = marker_counts.merge(
    marker_lengths[["gene", "len"]],
    on="gene",
    how="inner",
)

# The SGB identifier is the last "|"-separated field of the marker name.
table["SGB"] = table["gene"].str.split("|").str[-1]

# Force numeric dtypes before doing arithmetic on the two columns.
for numeric_column in ("len", "count"):
    table[numeric_column] = pd.to_numeric(table[numeric_column])

# Per-marker count divided by marker length, scaled by 1000, stored under
# the sample name so that it becomes the column header of the output.
table[args.sample_name] = table["count"] / table["len"] * 1000
table = table.drop(columns=["len", "count", "gene"])

# Average the per-marker values within each SGB, round to whole numbers and
# write a two-column (SGB, sample) tab-separated table.
per_sgb = table.groupby("SGB")[args.sample_name].mean()
per_sgb = per_sgb.round(0).astype(int)
per_sgb.to_csv(args.output_file, sep="\t")
import pandas as pd
import argparse
from pathlib import Path
import numpy as np
# Build the command-line interface: one or more per-sample SGB tables,
# followed by the two output paths.
parser = argparse.ArgumentParser()
parser.add_argument("--sgbtaxo", help="SGB taxonomy (GTDB format)", default=Path("mpa_vOct22_CHOCOPhlAnSGB_202212_SGB2GTDB.tsv.gz"),
                    type=Path)
# store_false: args.keepZeros is True by default and becomes False only when
# --keepZeros is given on the command line.
parser.add_argument("--keepZeros", help="Do not remove SGB that have a null count in the final matrix",
                    action="store_false")
parser.add_argument('metaphlan_SBG_agg', type=Path, nargs='+')
# NOTE(review): these positionals have no nargs='?', so they are always
# required and the `default` values are never used.
parser.add_argument('output_matrix', type=Path, default="matrix.tsv", help="Combine all metaphlan SGB data into a contengency matrix")
parser.add_argument('output_taxo_table', type=Path, default="taxonomy.tsv", help="If sgbtaxo available, output the taxonomy of existing SGB")
args = parser.parse_args()

# Read every per-sample table (tab-separated, first line is the header).
dataframes = [pd.read_csv(f, delimiter="\t") for f in args.metaphlan_SBG_agg]

# Combine the tables column-wise, joining on the SGB identifier.
# NOTE(review): pd.merge defaults to how="inner", so an SGB missing from any
# single input is dropped from the matrix - confirm this is intended.
df = dataframes[0]
for dataframe in dataframes[1:]:
    df = pd.merge(df, dataframe, on='SGB')

# Drop rows whose numeric columns sum to zero. Because of store_false above,
# this runs by default and is skipped when --keepZeros is passed.
if args.keepZeros:
    df = df[df.select_dtypes(include=[np.number]).sum(axis=1) != 0]
df.to_csv(args.output_matrix, sep="\t", index=False)

# NOTE(review): args.sgbtaxo always holds a Path (the default or the user's
# value), so this branch always runs and the file must exist.
if args.sgbtaxo:
    taxo = pd.read_csv(args.sgbtaxo, compression="gzip", delimiter="\t", names=["SGB", "Annotation"])
    # Keep only the SGBs present in the matrix. The explicit copy makes the
    # column assignments below operate on an independent frame instead of a
    # slice of the table that was read (avoids SettingWithCopyWarning).
    taxo = taxo[taxo['SGB'].isin(df['SGB'])].copy()
    # Strip rank prefixes such as "d__" or "p__" from the annotation.
    taxo['Annotation'] = taxo['Annotation'].str.replace(r'\w__', '', regex=True)
    # Split 'Annotation' into one column per taxonomic rank.
    taxo[['Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']] = taxo['Annotation'].str.split(';', expand=True)
    # Add 'marker_SGB' column: the SGB id joined to the last non-empty rank.
    taxo['marker_SGB'] = taxo['SGB'] + '_' + taxo['Annotation'].apply(lambda x: [i for i in x.split(';') if i][-1])
    del taxo['Annotation']
    taxo.to_csv(args.output_taxo_table, sep="\t", index=False)
File added
File added
[tool.poetry]
name = "metaphlan-script"
version = "0.1.0"
description = "Convert metaphlan marker_counts output to raw count matrix"
authors = ["Amine Ghozlane <amine.ghozlane@pasteur.fr>"]
# SPDX identifier; the script header grants GPL version 3 "or (at your
# option) any later version".
license = "GPL-3.0-or-later"
readme = "README.md"
[tool.poetry.dependencies]
python = "^3.10"
pandas = "^2.2.1"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
numpy==1.26.4 ; python_version >= "3.10" and python_version <= "3.11" or python_version >= "3.12" and python_version < "4.0" \
--hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \
--hash=sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818 \
--hash=sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20 \
--hash=sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0 \
--hash=sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010 \
--hash=sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a \
--hash=sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea \
--hash=sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c \
--hash=sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71 \
--hash=sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110 \
--hash=sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be \
--hash=sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a \
--hash=sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a \
--hash=sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5 \
--hash=sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed \
--hash=sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd \
--hash=sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c \
--hash=sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e \
--hash=sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0 \
--hash=sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c \
--hash=sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a \
--hash=sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b \
--hash=sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0 \
--hash=sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6 \
--hash=sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2 \
--hash=sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a \
--hash=sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30 \
--hash=sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218 \
--hash=sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5 \
--hash=sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07 \
--hash=sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2 \
--hash=sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4 \
--hash=sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764 \
--hash=sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef \
--hash=sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3 \
--hash=sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f
pandas==2.2.1 ; python_version >= "3.10" and python_version < "4.0" \
--hash=sha256:04f6ec3baec203c13e3f8b139fb0f9f86cd8c0b94603ae3ae8ce9a422e9f5bee \
--hash=sha256:06cf591dbaefb6da9de8472535b185cba556d0ce2e6ed28e21d919704fef1a9e \
--hash=sha256:0ab90f87093c13f3e8fa45b48ba9f39181046e8f3317d3aadb2fffbb1b978572 \
--hash=sha256:0f573ab277252ed9aaf38240f3b54cfc90fff8e5cab70411ee1d03f5d51f3944 \
--hash=sha256:101d0eb9c5361aa0146f500773395a03839a5e6ecde4d4b6ced88b7e5a1a6403 \
--hash=sha256:11940e9e3056576ac3244baef2fedade891977bcc1cb7e5cc8f8cc7d603edc89 \
--hash=sha256:1ba21b1d5c0e43416218db63037dbe1a01fc101dc6e6024bcad08123e48004ab \
--hash=sha256:4aa1d8707812a658debf03824016bf5ea0d516afdea29b7dc14cf687bc4d4ec6 \
--hash=sha256:4acf681325ee1c7f950d058b05a820441075b0dd9a2adf5c4835b9bc056bf4fb \
--hash=sha256:53680dc9b2519cbf609c62db3ed7c0b499077c7fefda564e330286e619ff0dd9 \
--hash=sha256:739cc70eaf17d57608639e74d63387b0d8594ce02f69e7a0b046f117974b3019 \
--hash=sha256:76f27a809cda87e07f192f001d11adc2b930e93a2b0c4a236fde5429527423be \
--hash=sha256:7d2ed41c319c9fb4fd454fe25372028dfa417aacb9790f68171b2e3f06eae8cd \
--hash=sha256:88ecb5c01bb9ca927ebc4098136038519aa5d66b44671861ffab754cae75102c \
--hash=sha256:8df8612be9cd1c7797c93e1c5df861b2ddda0b48b08f2c3eaa0702cf88fb5f88 \
--hash=sha256:94e714a1cca63e4f5939cdce5f29ba8d415d85166be3441165edd427dc9f6bc0 \
--hash=sha256:9bd8a40f47080825af4317d0340c656744f2bfdb6819f818e6ba3cd24c0e1397 \
--hash=sha256:9d1265545f579edf3f8f0cb6f89f234f5e44ba725a34d86535b1a1d38decbccc \
--hash=sha256:a935a90a76c44fe170d01e90a3594beef9e9a6220021acfb26053d01426f7dc2 \
--hash=sha256:af5d3c00557d657c8773ef9ee702c61dd13b9d7426794c9dfeb1dc4a0bf0ebc7 \
--hash=sha256:c2ce852e1cf2509a69e98358e8458775f89599566ac3775e70419b98615f4b06 \
--hash=sha256:c38ce92cb22a4bea4e3929429aa1067a454dcc9c335799af93ba9be21b6beb51 \
--hash=sha256:c391f594aae2fd9f679d419e9a4d5ba4bce5bb13f6a989195656e7dc4b95c8f0 \
--hash=sha256:c70e00c2d894cb230e5c15e4b1e1e6b2b478e09cf27cc593a11ef955b9ecc81a \
--hash=sha256:df0c37ebd19e11d089ceba66eba59a168242fc6b7155cba4ffffa6eccdfb8f16 \
--hash=sha256:e97fbb5387c69209f134893abc788a6486dbf2f9e511070ca05eed4b930b1b02 \
--hash=sha256:f02a3a6c83df4026e55b63c1f06476c9aa3ed6af3d89b4f04ea656ccdaaaa359 \
--hash=sha256:f821213d48f4ab353d20ebc24e4faf94ba40d76680642fb7ce2ea31a3ad94f9b \
--hash=sha256:f9d3558d263073ed95e46f4650becff0c5e1ffe0fc3a015de3c79283dfbdb3df
python-dateutil==2.8.2 ; python_version >= "3.10" and python_version < "4.0" \
--hash=sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86 \
--hash=sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9
pytz==2024.1 ; python_version >= "3.10" and python_version < "4.0" \
--hash=sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812 \
--hash=sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319
six==1.16.0 ; python_version >= "3.10" and python_version < "4.0" \
--hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \
--hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254
tzdata==2024.1 ; python_version >= "3.10" and python_version < "4.0" \
--hash=sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd \
--hash=sha256:9068bc196136463f5245e51efda838afa15aaeca9903f49050dfa2679db4d252
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment