Commit 03017054 authored by Kenzo-Hugo Hillion
Browse files

add script to import gene sequences from fa.gz IGC file

parent 092da880
Pipeline #23845 passed with stages
in 2 minutes and 45 seconds
......@@ -61,6 +61,7 @@ django-pandas = "*"
bioapi = {git = "https://github.com/khillion/bioapi.git"}
django-admin-list-filter-dropdown = "*"
gunicorn = "*"
pyfastx = "*"
[requires]
python_version = "3.7"
This diff is collapsed.
import logging
import pyfastx
from django.core.management.base import BaseCommand
from slugify import slugify
from metagenedb.apps.catalog.models import Gene
# Configure the root logger's output format: timestamp, level, logger name, message.
logging.basicConfig(format='[%(asctime)s] %(levelname)s:%(name)s:%(message)s')
# Module-level logger; its level is adjusted by Command.set_logger_level below.
logger = logging.getLogger(__name__)
class ImportIGCGeneSequences(object):
    """Fill the ``sequence`` field of existing ``Gene`` rows from a FASTA file.

    Genes are matched on ``gene_id``; entries of the FASTA file with no
    matching gene in the database are counted as skipped, never created.

    Attributes:
        sequence_file: path handed to ``pyfastx.Fasta`` by ``load_all``.
        processed_genes: number of FASTA records read so far.
        updated_genes: number of genes whose sequence was written to the DB.
        skipped_genes: number of records that did not result in an update.
    """

    def __init__(self, sequence_file):
        self.sequence_file = sequence_file
        self._reset_counters()

    def _reset_counters(self):
        """Set the processed / updated / skipped counters back to zero."""
        self.processed_genes = 0
        self.updated_genes = 0
        self.skipped_genes = 0

    def update_sequences(self, sequences):
        """Write the given sequences to the matching genes in one bulk update.

        Args:
            sequences: dict mapping ``gene_id`` to the sequence string.

        The update is best-effort: if the bulk update raises, the whole
        batch is counted as skipped and the error is logged with its
        traceback instead of being propagated.
        """
        # Evaluate the queryset once: the same list gives the objects to
        # update and their count, instead of a separate COUNT query.
        genes = list(Gene.objects.filter(gene_id__in=sequences.keys()))
        for gene in genes:
            gene.sequence = sequences[gene.gene_id]
        try:
            Gene.objects.bulk_update(genes, ['sequence'])
        except Exception:
            # Keep going with the next batch, but record why this one failed.
            logger.warning("Could not update genes... skipped.", exc_info=True)
            self.skipped_genes += len(sequences)
        else:
            self.updated_genes += len(genes)
            # Records without a matching gene in the DB are skipped.
            self.skipped_genes += len(sequences) - len(genes)

    def load_all(self, test=False, chunk_size=10000):
        """Read the FASTA file and update genes chunk by chunk.

        Args:
            test: when true, stop after the first full chunk has been sent
                to the database.
            chunk_size: number of FASTA records accumulated before each
                call to ``update_sequences``.
        """
        logger.info("Starting IGC Gene sequences import (update) to DB")
        current_sequences = {}
        for name, seq in pyfastx.Fasta(self.sequence_file, build_index=False):
            # The first whitespace-delimited token of the record name,
            # slugified, is the key matched against Gene.gene_id.
            current_sequences[slugify(name.split()[0])] = seq
            self.processed_genes += 1
            if self.processed_genes % chunk_size == 0:
                self.update_sequences(current_sequences)
                logger.info("%s Gene sequences processed so far...", self.processed_genes)
                current_sequences = {}
                if test:
                    break
        # Flush the last, possibly partial, chunk.
        if current_sequences:
            self.update_sequences(current_sequences)
        logger.info("[DONE] %s/%s Gene sequences updated.", self.updated_genes, self.processed_genes)
        logger.info("[DONE] %s/%s Genes skipped.", self.skipped_genes, self.processed_genes)
class Command(BaseCommand):
    """Django management command that runs ``ImportIGCGeneSequences`` on a FASTA file."""

    # Shown by `manage.py help <command>`; describes what this command does.
    help = 'Update the sequences of existing genes from an IGC fa.gz file.'

    def add_arguments(self, parser):
        """Declare the positional FASTA path and the optional ``--test`` flag."""
        parser.add_argument('fasta', help='IGC.fa.gz file from IGC. Genes need to exist in DB for this script to work.')
        parser.add_argument('--test', action='store_true', help='Run only on first 10000 sequences.')

    def set_logger_level(self, verbosity):
        """Map the command verbosity to the module logger level.

        Verbosity above 2 selects DEBUG, verbosity 2 selects INFO; lower
        values leave the logger level untouched.
        """
        if verbosity > 2:
            logger.setLevel(logging.DEBUG)
        elif verbosity > 1:
            logger.setLevel(logging.INFO)

    def handle(self, *args, **options):
        """Entry point: configure logging, then import sequences from the FASTA file."""
        self.set_logger_level(int(options['verbosity']))
        import_igc = ImportIGCGeneSequences(options['fasta'])
        import_igc.load_all(test=options['test'])
from rest_framework.test import APITestCase
from metagenedb.apps.catalog.models import Gene
from metagenedb.apps.catalog.management.commands.import_igc_sequences import ImportIGCGeneSequences
from metagenedb.apps.catalog.factory import (
GeneFactory,
)
class TestUpdateSequences(APITestCase):
    """Check that ``update_sequences`` stores a sequence on an existing gene."""

    @classmethod
    def setUpTestData(cls):
        # One gene created through the factory, shared by the class.
        cls.gene = GeneFactory()

    def setUp(self):
        # "test" is a placeholder path: update_sequences never reads the file.
        self.import_igc_seq = ImportIGCGeneSequences("test")

    def test_update_sequence(self):
        """The gene starts without a sequence and holds the new one afterwards."""
        expected_sequence = "ACTG"
        gene_id = self.gene.gene_id

        stored_before = Gene.objects.get(gene_id=gene_id).sequence
        self.assertFalse(stored_before)

        self.import_igc_seq.update_sequences({gene_id: expected_sequence})

        stored_after = Gene.objects.get(gene_id=gene_id).sequence
        self.assertEqual(stored_after, expected_sequence)
# Generated by Django 3.0.1 on 2020-02-06 10:05
from django.db import migrations, models
class Migration(migrations.Migration):
    """Alter ``Gene.source`` to a 10-character choice field defaulting to 'undef'."""

    dependencies = [
        ('catalog', '0023_add_sequences_and_source_to_gene'),
    ]

    operations = [
        migrations.AlterField(
            model_name='gene',
            name='source',
            field=models.CharField(
                choices=[
                    ('undef', 'Undefined'),
                    ('igc', 'IGC'),
                ],
                default='undef',
                max_length=10,
            ),
        ),
    ]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment