Commit b54a6909 authored by Kenzo-Hugo Hillion
Browse files

Allow fixing the gene source when computing statistics

parent 509ac089
......@@ -12,7 +12,9 @@ logger = logging.getLogger(__name__)
class ComputeStatistics:
GENE_SOURCES = ['all', 'igc', 'virgo']
# Initializer: remembers which gene source this statistics run is restricted to.
# GENE_SOURCES on the class lists 'all', 'igc' and 'virgo'; no check against
# that list is performed here.
def __init__(self, gene_source):
# Read later by the subclasses' methods to build query filters and stats ids.
self.gene_source = gene_source
def _save_to_db(self, payload):
try:
......@@ -39,13 +41,13 @@ class ComputeCounts(ComputeStatistics):
]
FUNCTION_SOURCES = ['kegg', 'eggnog']
def compute_count(self, method, gene_source, filters=None, **kwargs):
def compute_count(self, method, filters=None, **kwargs):
if filters is None:
filters = {}
gene_stats = GeneStatistics(filters=filters)
print_kwargs = '-'.join([str(k) + '-' + str(v) for k, v in kwargs.items() if v])
print_filters = '-'.join([str(k) + '-' + str(v) for k, v in filters.items() if v])
stats_id = slugify(f"GeneStatistics({gene_source}).{method}({print_kwargs})")
stats_id = slugify(f"GeneStatistics({self.gene_source}).{method}({print_kwargs})")
logger.info(
"Call GeneStatistics(%s).%s(%s) and saving under id <%s>",
print_filters, method, print_kwargs, stats_id
......@@ -59,15 +61,14 @@ class ComputeCounts(ComputeStatistics):
self._save_to_db(payload)
def all(self):
for gene_source in self.GENE_SOURCES:
if gene_source == 'all':
filters = {}
else:
filters = {'source': gene_source}
for method in self.METHODS:
self.compute_count(method, gene_source, filters=filters)
for source in self.FUNCTION_SOURCES:
self.compute_count('count_has_function', gene_source, filters=filters, source=source)
if self.gene_source == 'all':
filters = {}
else:
filters = {'source': self.gene_source}
for method in self.METHODS:
self.compute_count(method, filters=filters)
for source in self.FUNCTION_SOURCES:
self.compute_count('count_has_function', filters=filters, source=source)
class ComputeGeneLength(ComputeStatistics):
......@@ -87,13 +88,13 @@ class ComputeGeneLength(ComputeStatistics):
},
}
def _compute_gene_length(self, filters, category, gene_source):
def _compute_gene_length(self, filters, category):
gene_stats = GeneLengthDistribution(
window_size=self.MIN_WINDOW_SIZE, stop_at=self.MAX_STOP_AT, filters=filters
)
for window_size in self.WINDOW_SIZES:
for stop_at in self.STOP_ATS:
stats_id = slugify(f"GeneStatistics({gene_source}).gene-length-{window_size}-{stop_at}-{category}")
stats_id = slugify(f"GeneStatistics({self.gene_source}).gene-length-{window_size}-{stop_at}-{category}")
logger.info(
"Call GeneStatistics.gene_length(%s, %s) for %s and saving under id <%s>",
window_size, stop_at, category, stats_id)
......@@ -104,15 +105,14 @@ class ComputeGeneLength(ComputeStatistics):
self._save_to_db(payload)
def all(self):
    """Run the gene-length computation for every category.

    The base filters come from ``self.gene_source``: no filter when it is
    ``'all'``, otherwise ``{'source': self.gene_source}``. Each entry of
    ``self.CATEGORIES`` may contribute extra filters (or ``None`` for no
    extra filter); the combined filters and the category name are handed
    to ``self._compute_gene_length``.
    """
    if self.gene_source == 'all':
        base_filters = {}
    else:
        base_filters = {'source': self.gene_source}
    for category, cat_filters in self.CATEGORIES.items():
        # Start from a fresh copy for each category: updating one shared
        # dict in place would carry the filters of earlier categories over
        # into the later ones.
        filters = dict(base_filters)
        if cat_filters is not None:
            filters.update(cat_filters)
        self._compute_gene_length(filters, category)
class ComputeTaxonomyRepartition(ComputeStatistics):
......@@ -121,23 +121,22 @@ class ComputeTaxonomyRepartition(ComputeStatistics):
]
def all(self):
for gene_source in self.GENE_SOURCES:
if gene_source == 'all':
filters = {}
else:
filters = {'source': gene_source}
gene_stats = GeneStatistics(filters=filters)
for level in self.ALL_LEVEL:
stats_id = slugify(f"GeneStatistics({gene_source}).taxonomy_repartition({level})")
logger.info(
"Call GeneStatistics.taxonomy_repartition(%s) and saving under id <%s>",
level, stats_id
)
payload = {
'stats_id': stats_id,
'body': gene_stats.taxonomy_repartition(level=level)
}
self._save_to_db(payload)
if self.gene_source == 'all':
filters = {}
else:
filters = {'source': self.gene_source}
gene_stats = GeneStatistics(filters=filters)
for level in self.ALL_LEVEL:
stats_id = slugify(f"GeneStatistics({self.gene_source}).taxonomy_repartition({level})")
logger.info(
"Call GeneStatistics.taxonomy_repartition(%s) and saving under id <%s>",
level, stats_id
)
payload = {
'stats_id': stats_id,
'body': gene_stats.taxonomy_repartition(level=level)
}
self._save_to_db(payload)
class ComputeTaxonomyPresence(ComputeStatistics):
......@@ -146,31 +145,32 @@ class ComputeTaxonomyPresence(ComputeStatistics):
]
def all(self):
for gene_source in self.GENE_SOURCES:
if gene_source == 'all':
filters = {}
else:
filters = {'source': gene_source}
gene_stats = GeneStatistics(filters=filters)
for level in self.ALL_LEVEL:
stats_id = slugify(f"GeneStatistics({gene_source}).present_taxonomy({level})")
logger.info(
"Call GeneStatistics.present_taxonomy(%s) and saving under id <%s>",
level, stats_id
)
payload = {
'stats_id': stats_id,
'body': gene_stats.present_taxonomy(level=level)
}
self._save_to_db(payload)
if self.gene_source == 'all':
filters = {}
else:
filters = {'source': self.gene_source}
gene_stats = GeneStatistics(filters=filters)
for level in self.ALL_LEVEL:
stats_id = slugify(f"GeneStatistics({self.gene_source}).present_taxonomy({level})")
logger.info(
"Call GeneStatistics.present_taxonomy(%s) and saving under id <%s>",
level, stats_id
)
payload = {
'stats_id': stats_id,
'body': gene_stats.present_taxonomy(level=level)
}
self._save_to_db(payload)
class Command(BaseCommand):
help = "Compute gene catalog statistics."
STEP_CHOICES = ['clean', 'counts', 'gene-length', 'taxonomy_repartition', 'taxonomy_presence']
SOURCE_CHOICES = ['all', 'virgo', 'igc']
def add_arguments(self, parser):
    """Declare the command-line options of the command.

    Args:
        parser: argument parser on which the optional ``--only`` and
            ``--source`` options are registered via ``add_argument``.
    """
    parser.add_argument('--only', help=f'Run only one step (choices: {self.STEP_CHOICES}).')
    # --source selects a gene source, not a step, so its help text must not
    # repeat the --only description.
    parser.add_argument(
        '--source',
        help=f'Compute statistics for only one gene source (choices: {self.SOURCE_CHOICES}).'
    )
def set_logger_level(self, verbosity):
if verbosity > 2:
......@@ -178,22 +178,38 @@ class Command(BaseCommand):
elif verbosity > 1:
logger.setLevel(logging.INFO)
def handle(self, *args, **options):
self.set_logger_level(int(options['verbosity']))
only_step = str(options['only'])
if only_step is not None:
if only_step not in self.STEP_CHOICES:
def _get_and_validate_only_step(self, only_step_str):
if only_step_str is not None:
if only_step_str not in self.STEP_CHOICES:
logger.warning(
"Choice '%s' is not a valid choice. Please choose among %s",
only_step, self.STEP_CHOICES
only_step_str, self.STEP_CHOICES
)
if only_step is None or only_step == "clean":
ComputeStatistics().clean_db()
if only_step is None or only_step == "counts":
ComputeCounts().all()
if only_step is None or only_step == "gene-length":
ComputeGeneLength().all()
if only_step is None or only_step == "taxonomy_repartition":
ComputeTaxonomyRepartition().all()
if only_step is None or only_step == "taxonomy_presence":
ComputeTaxonomyPresence().all()
return only_step_str
def _get_and_validate_source(self, source_str):
    """Turn the ``--source`` option value into the list of sources to process.

    Args:
        source_str: value given for ``--source``, or ``None`` when absent.

    Returns:
        ``self.SOURCE_CHOICES`` when no source was given, a one-element
        list holding ``source_str`` when it is a known choice, and an
        empty list (after logging a warning) when it is not.
    """
    # No restriction requested: every known source is processed.
    if source_str is None:
        return self.SOURCE_CHOICES
    if source_str in self.SOURCE_CHOICES:
        return [source_str]
    logger.warning(
        "Choice '%s' is not a valid choice. Please choose among %s",
        source_str, self.SOURCE_CHOICES
    )
    return []
def handle(self, *args, **options):
    """Run the selected statistics steps for each selected gene source.

    Reads ``verbosity``, ``only`` and ``source`` from ``options``. With no
    ``only`` value every step runs, in the order listed below; otherwise
    only the step whose name equals it runs.
    """
    self.set_logger_level(int(options['verbosity']))
    only_step = self._get_and_validate_only_step(options['only'])
    gene_sources = self._get_and_validate_source(options['source'])

    # Ordered (step name, runner) pairs; each runner builds its helper only
    # when the step is actually executed.
    steps = (
        ("clean", lambda src: ComputeStatistics(src).clean_db()),
        ("counts", lambda src: ComputeCounts(src).all()),
        ("gene-length", lambda src: ComputeGeneLength(src).all()),
        ("taxonomy_repartition", lambda src: ComputeTaxonomyRepartition(src).all()),
        ("taxonomy_presence", lambda src: ComputeTaxonomyPresence(src).all()),
    )
    # NOTE(review): "clean" runs once per gene source; clean_db is not
    # visible here - confirm it does not remove statistics already computed
    # for a previous source in this same loop.
    for gene_source in gene_sources:
        for step_name, run_step in steps:
            if only_step is None or only_step == step_name:
                run_step(gene_source)
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment