Commit 2553eec8 authored by Kenzo-Hugo Hillion's avatar Kenzo-Hugo Hillion
Browse files

remove notebooks

parent 1cbc7043
......@@ -7,4 +7,5 @@ __pycache__/
web/metagenedb/settings/__init__.py
# Jupyter notebook
notebooks/
.ipynb_checkpoints/
%% Cell type:markdown id: tags:
## Proof of concept
The goal here is to load all the genes from the IGC annotation file into the database.
%% Cell type:code id: tags:
``` python
import functools
import time
# Timer decorator
def timer(function):
    """Decorate ``function`` so that every call logs its execution time.

    The wrapper forwards all positional and keyword arguments, returns
    whatever ``function`` returns, and emits one INFO record on the
    module-level ``_LOGGER`` once the call has completed. Nothing is logged
    when ``function`` raises.

    Args:
        function: the callable to time.

    Returns:
        A wrapper that keeps the name, docstring and other metadata of
        ``function``.
    """
    # Without functools.wraps the decorated function would be reported as
    # "wrapper" by introspection tools and lose its docstring.
    @functools.wraps(function)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by system clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        result = function(*args, **kwargs)
        _LOGGER.info("[{}] EXECUTED TIME: {} seconds".format(function.__name__, time.perf_counter() - start_time))
        return result
    return wrapper
```
%% Cell type:markdown id: tags:
### Input files
%% Cell type:code id: tags:
``` python
# Location of the sample IGC annotation summary file loaded by this notebook.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
IGC_annotation_path = "/home/khillion/Pasteur/gitlab/metagenedb/dev_data/IGC_sample.annotation_OF.summary"
```
%% Cell type:code id: tags:
``` python
import logging
# Configure the root logger so that INFO records and above are emitted.
logging.basicConfig(level=logging.INFO)
# Logger used by the timing decorator and the loading helpers of this notebook.
_LOGGER = logging.getLogger(__name__)
```
%% Cell type:code id: tags:
``` python
from itertools import islice
from django.core.exceptions import ValidationError
from metagenedb.apps.catalog.models import Gene
def create_gene(raw_line):
    """Build an unsaved ``Gene`` from one tab-separated annotation line.

    Trailing whitespace is stripped before the line is split on tabs. The
    fields at positions 1, 2, 6 and 5 are used as gene id, gene length,
    taxonomic genus and taxonomic phylum respectively.

    Args:
        raw_line: one raw line of the annotation file.

    Returns:
        A ``Gene`` instance that has not been validated or saved.

    Raises:
        IndexError: if the line has fewer than seven tab-separated fields.
    """
    fields = raw_line.rstrip().split('\t')
    identifier = fields[1]
    length = fields[2]
    genus = fields[6]
    phylum = fields[5]
    return Gene(
        gene_id=identifier,
        gene_length=length,
        taxonomic_genus=genus,
        taxonomic_phylum=phylum,
    )
def insert_gene(gene):
    """Validate ``gene`` and then write it to the database.

    Args:
        gene: the model instance to store.

    Any exception raised by ``full_clean`` propagates to the caller, in
    which case ``save`` is never reached.
    """
    # Validation runs first so an invalid instance is never saved.
    gene.full_clean()
    gene.save()
def insert_gene_list(chunk_genes):
    """Insert every annotation line of a chunk into the database.

    Each line is parsed with ``create_gene`` and stored with ``insert_gene``.
    A line that cannot be parsed or that fails validation is logged as a
    warning and skipped, so one bad record does not abort the whole load.

    Args:
        chunk_genes: iterable of raw tab-separated annotation lines.
    """
    for raw_line in chunk_genes:
        try:
            gene = create_gene(raw_line)
        except IndexError:
            # Too few tab-separated fields (e.g. a blank or truncated line):
            # there is no gene to report, so log the offending line itself.
            _LOGGER.warning(f"Malformed annotation line {raw_line!r}. Insertion skipped.")
            continue
        try:
            insert_gene(gene)
        except ValidationError as e:
            _LOGGER.warning(f"{e.__dict__} for gene_id: {gene.gene_id}. Insertion skipped.")
@timer
def load_annotation_file_to_db_in_chunks(annotation_file, chunk_size=100000):
    """Load an annotation file into the database, ``chunk_size`` lines at a time.

    The file is read in slices of at most ``chunk_size`` lines; each slice is
    handed to ``insert_gene_list`` and a running total of processed lines is
    logged after every slice, followed by a final summary.

    Args:
        annotation_file: path of the annotation file to read.
        chunk_size: maximum number of lines handled per slice.
    """
    processed = 0
    with open(annotation_file, 'r') as handle:
        # iter() with a sentinel keeps requesting slices until islice yields
        # an empty list, i.e. until the file is exhausted.
        for lines in iter(lambda: list(islice(handle, chunk_size)), []):
            processed += len(lines)
            insert_gene_list(lines)
            _LOGGER.info(f"{processed} genes processed so far...")
    _LOGGER.info(f"[DONE] {processed} genes processed.")
```
%% Cell type:code id: tags:
``` python
# Delete every Gene row so that the next load starts from an empty table.
Gene.objects.all().delete()
# load_annotation_file_to_db_in_chunks(IGC_annotation_path)
```
%% Output
(1000, {'catalog.Gene': 1000})
%% Cell type:code id: tags:
``` python
# Load the sample annotation file using the default chunk size.
load_annotation_file_to_db_in_chunks(IGC_annotation_path)
```
%% Output
WARNING:__main__:{'error_dict': {'gene_id': [ValidationError(['Gene with this Gene id already exists.'])]}} for gene_id: MH0385_GL0059251. Insertion skipped.
INFO:__main__:1001 genes processed so far...
INFO:__main__:[DONE] 1001 genes processed.
INFO:__main__:[load_annotation_file_to_db_in_chunks] EXECUTED TIME: 3.5965709686279297 seconds
%% Cell type:code id: tags:
``` python
import metagenedb.settings
```
%% Cell type:code id: tags:
``` python
# Display the MIDDLEWARE setting of the metagenedb project.
metagenedb.settings.MIDDLEWARE
```
%% Output
['django.middleware.security.SecurityMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',
'django.middleware.common.CommonMiddleware',
'django.middleware.csrf.CsrfViewMiddleware',
'django.contrib.auth.middleware.AuthenticationMiddleware',
'django.contrib.messages.middleware.MessageMiddleware',
'django.middleware.clickjacking.XFrameOptionsMiddleware',
'corsheaders.middleware.CorsMiddleware']
%% Cell type:code id: tags:
``` python
# Display the INSTALLED_APPS setting of the metagenedb project.
metagenedb.settings.INSTALLED_APPS
```
%% Output
['metagenedb.apps.catalog',
'django.contrib.admin',
'django.contrib.auth',
'django.contrib.contenttypes',
'django.contrib.sessions',
'django.contrib.messages',
'django.contrib.staticfiles',
'rest_framework',
'django_extensions',
'corsheaders']
%% Cell type:code id: tags:
``` python
from metagenedb.apps.catalog.models import KeggOrthology
```
%% Cell type:code id: tags:
``` python
# Show all KeggOrthology entries currently stored.
KeggOrthology.objects.all()
```
%% Output
<QuerySet []>
%% Cell type:code id: tags:
``` python
# Build a KeggOrthology entry through the manager's create_kegg method.
# NOTE(review): create_kegg is defined outside this notebook; whether it
# already saves the entry is not visible here.
a_kegg = KeggOrthology.objects.create_kegg(function_id="K01824")
```
%% Cell type:code id: tags:
``` python
# Presumably persists the entry created above to the database; confirm
# against what create_kegg returns.
a_kegg.save()
```
%% Cell type:code id: tags:
``` python
import requests
```
%% Output
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
<ipython-input-4-95039fbd75c1> in <module>
----> 1 import requests
ModuleNotFoundError: No module named 'requests'
%% Cell type:code id: tags:
``` python
```
This diff is collapsed.
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment